;;**************************************
;;  procs_a.inc file. Some procedures **
;;**************************************
;;  procedures list:
;;
;;  apply_displacement_from_tex
;;  bar_tri_area
;;  blur_screen
;;  calc_bumpmap
;;  calc_bumpmap_coords
;;  calc_tri
;;  call_thread
;;  check_dist
;;  clip_triangles
;;  del_tiv_ie_without
;;  do_color_buffer
;;  do_deformation_normals_depend
;;  do_edges_list
;;  do_mandel_tex
;;  draw_triangulize_rect
;;  fix_normals
;;  from_tex
;;  generate_texture1
;;  generate_texture2
;;  generate_texture3
;;  get_min_max_vert
;;  init_envmap2
;;  init_point_lights
;;  init_s_tex
;;  load_tetrahedron
;;  make_random_lights
;;  mandel
;;  mandel_th
;;  mark_inner_vert
;;  mark_inner_vert_th
;;  move_texture
;;  normalize_object
;;  random
;;  rem_cracks
;;  remove_redundant_vert_ch  ; merge verts
;;  rm_inner_faces
;;  rm_inner_faces_th
;;  xy_in_rect

;-----------------------------------------------------------------------
; calc_tri - edge vectors, edge lengths and shape figures of one triangle
; in:
;    edi = base address of the vertex array
;    eax, ebx, edx = byte offsets of the three vertices from edi
;                    (callers in this file pass vertex index * 12);
;                    the vertices are called a (eax), b (ebx), d (edx) below
; out:
;    xmm0 = d - a, xmm1 = a - b, xmm2 = d - b      (edge vectors)
;    xmm3, xmm4, xmm5 = the same vectors multiplied by the rcpps
;                    (approximate) reciprocal of their lengths;
;                    only the three low lanes are meaningful
;    xmm6 = lo -> hi: |d-a|, |a-b|, |d-b|, h
;           h is described by the original author as the height of the
;           triangle perpendicular to the a-d edge
;    xmm7 = lo -> hi: |d-b| / |a-b| (ratio used as a degeneration
;           indicator), h, 0, S
;           S is described by the original author as the triangle area
; changes -> all xmm0 - xmm7 registers
;         -> no general purpose registers (eax and edi are saved/restored)
; notes:
;    - every vertex is fetched with a 16 byte load, so 4 bytes behind each
;      12 byte vertex are read as well; the dpps masks ignore that lane
;    - zero_hgst and f05x3 are defined outside this chunk; presumably a
;      mask clearing the highest dword and a vector of 0.5 - TODO confirm
;    - NOTE(review): the value that is square-rooted below is
;      |d-a| * |d-b| * (p - |a-b|) * p, while the Heron formula quoted in
;      the comments is p*(p-a)*(p-b)*(p-c) - confirm which one is intended
;-----------------------------------------------------------------------
calc_tri:
   push   ebp
   mov    ebp,esp
   sub    esp,40
   .zer   equ [ebp-8]     ; 8 zero bytes, used to clear register halves
   .pol   equ [ebp-16]    ; first p (half of the edge length sum), later S
   .ffa3  equ [ebp-24]    ; |d-b|    \
   .ffa2  equ [ebp-28]    ; |a-b|     > written by one 16 byte store at .ffa1
   .ffa1  equ [ebp-32]    ; |d-a|    /  (the 4th dword lands at ebp-20)
   .area1 equ [ebp-36]    ; p - |a-b|   (high dword of the store at .area2)
   .area2 equ [ebp-40]    ; p - |d-a|

   push   eax edi
   movups xmm1,[eax+edi]  ; xmm1 = vertex a
   xorps  xmm7,xmm7
   movups xmm3,[ebx+edi]  ; xmm3 = vertex b
   movups xmm0,[edx+edi]  ; xmm0 = vertex d
   movlps .zer,xmm7
   movaps xmm2,xmm0
   subps  xmm0,xmm1 ; xmm0 = d - a  (edx - eax)
   subps  xmm1,xmm3 ; xmm1 = a - b  (eax - ebx)
   subps  xmm2,xmm3 ; xmm2 = d - b  (edx - ebx)
   movaps xmm3,xmm0
   movaps xmm4,xmm1
   movaps xmm5,xmm2
   ; squared lengths: xyz dot product, each result in a different lane
   dpps   xmm3,xmm3,01110001b  ; lane 0 = |d - a|^2
   dpps   xmm4,xmm4,01110010b  ; lane 1 = |a - b|^2
   dpps   xmm5,xmm5,01110100b  ; lane 2 = |d - b|^2

   ; merge the three single-lane results, then take the square roots
   movaps xmm6,xmm3
   orps   xmm6,xmm4
   orps   xmm6,xmm5
   sqrtps xmm6,xmm6
   ; xmm6 = lo -> hi: |d-a|, |a-b|, |d-b|, 0  => edge lengths
   movaps xmm3,xmm0
   movaps xmm4,xmm1
   movaps xmm5,xmm2
   ; scale every edge vector by the approximate reciprocal of its length
   rcpps  xmm7,xmm6
   shufps xmm7,xmm7,11000000b  ; 1/|d-a| in lanes 0-2
   mulps  xmm3,xmm7
   rcpps  xmm7,xmm6
   shufps xmm7,xmm7,11010101b  ; 1/|a-b| in lanes 0-2
   mulps  xmm4,xmm7
   rcpps  xmm7,xmm6
   shufps xmm7,xmm7,11101010b  ; 1/|d-b| in lanes 0-2
   mulps  xmm5,xmm7
   ; xmm3 = d - a direction
   ; xmm4 = a - b direction
   ; xmm5 = d - b direction
   mov     eax,zero_hgst       ; eax / edi are reused as pointers from here,
   mov     edi,f05x3           ; their input values are restored at the end
   movaps  xmm7,xmm6
   andps   xmm7,[eax]
   haddps  xmm7,xmm7           ; two horizontal adds leave the lane sum
   haddps  xmm7,xmm7           ; in every lane
   mulps   xmm7,[edi]
   movlps  .pol,xmm7     ; pol = p from the Heron equation (per orig. comment)
   subps   xmm7,xmm6     ; xmm7 = p - edge length, lane by lane
   movlps  .area2,xmm7   ; .area2 = p - |d-a|, .area1 = p - |a-b|
   movups  .ffa1,xmm6    ; keep the edge lengths in .ffa1 - .ffa3
   ; original author's description of the next steps:
   ;   p = 1/2*(a+b+c), area = sqrt(p*(p-a)*(p-b)*(p-c)),
   ;   a,b,c = lengths of the triangle edges
   movhlps xmm7,xmm6    ; low half of xmm7 = high half of xmm6 = |d-b|, 0
   mulps   xmm7,xmm6    ; lane 0 = |d-b| * |d-a|
   mulss   xmm7,.area1  ; lane 0 *= p - |a-b|
   mulss   xmm7,.pol    ; lane 0 *= p
   sqrtps  xmm7,xmm7
   movlps  .pol,xmm7    ; .pol now holds S
   ; lane 0 of xmm7 = S ("area of tri" per the original comment)
   rcpss  xmm6,.ffa1  ;.x_tmp
   ; lane 0 of xmm6 = approximate 1 / |d-a|
   mulps   xmm7,xmm6
   mulps   xmm7,[edi]   ; lane 0 = S / |d-a| scaled by the f05x3 constant = h

   ; build xmm6 = lengths in lanes 0-2 and h in lane 3
   movups  xmm6,.ffa1
   andps   xmm6,[eax]
   movhps  xmm7,.zer           ; clear the high half of xmm7
   shufps  xmm7,xmm7,00111111b ; xmm7 = 0, 0, 0, h
   orps    xmm6,xmm7
   ; finally the ratio |d-b| / |a-b| is calculated as an indicator
   ; whether the triangle is degenerated
   movhlps xmm7,xmm6           ; lane 0 = |d-b|, lane 1 = h
   divss   xmm7,.ffa2
   movhps  xmm7,.pol           ; high half = S, 0
   shufps  xmm7,xmm7,10110100b ; swap lanes 2 and 3 => S in the highest dword
   pop     edi eax
   mov     esp,ebp
   pop     ebp
ret
;=====================================================================
check_dist:
; proc checks the minimal distance to the boundary edge in a triangle fan
; and remembers the far end of the shortest interior edge
; in: esi = list of boundary edges around the base vertex, two dword
;           vertex indices (not multiplied by 12) per fan triangle
;     ecx = number of triangles in the fan, must be > 0
;     ebx = base vertex offset (vertex index * 12)
;     edi = ptr to vertices list
; out:
;     xmm0 = lowest dword: minimal height reported by calc_tri over the
;            fan (distance from the base vertex to the boundary edges)
;     xmm1 = lowest dword: vertex index (dword integer, not multiplied
;            by 12) at the far end of the shortest interior edge;
;            it is left uninitialised if no edge was ever selected
; destroys:
;     all xmm0 - xmm7 registers (calc_tri is called for every triangle)
;     ecx (used as the loop counter, 0 on return), flags, DF is cleared
;     other general registers are preserved
    push     ebp
    mov      ebp,esp
    sub      esp,40
    .min_h   equ [ebp-8]     ; running minimum of the triangle heights
    .min_ed  equ [ebp-16]    ; running minimum of the interior edge lengths
    .min_in  equ [ebp-24]    ; vertex index belonging to .min_ed

    push     eax ebx edx esi edi
    mov      eax,100000.0
    mov      dword .min_h,eax   ; big value
    mov      dword .min_ed,eax
;   pcmpeqd  xmm0,xmm0
;   movlps   .min_in,xmm0
    ; ebx is assumed to be multiplied by 12 already
    cld
  .ll:
    lodsd
    xchg     eax,edx            ; edx = first boundary vertex (d)
    lodsd                       ; eax = second boundary vertex (a)
    push     edx eax            ; keep raw indices: [esp] = a, [esp+4] = d
    imul     eax,12
    imul     edx,12
    call     calc_tri           ; xmm6 = |d-a|, |a-b|, |d-b|, h
    movlps   xmm1,.min_ed
    shufps   xmm1,xmm1,0        ; broadcast current minimum
    movaps   xmm4,xmm6
    movhlps  xmm3,xmm6          ; xmm3 lane 0 = |d-b|
    shufps   xmm4,xmm4,11111001b ; xmm4 = |a-b|, |d-b|, h, h
    movaps   xmm0,xmm4          ; keep per-lane candidates for the compare
    comiss   xmm4,xmm3
    jb       @f                 ; |a-b| is the shorter interior edge
    movhlps  xmm4,xmm6          ; otherwise take |d-b|
  @@:
    shufps   xmm4,xmm4,0
    minps    xmm1,xmm4
    movlps   .min_ed,xmm1
    movlps   xmm5,[esp]
    cmpeqps  xmm1,xmm0          ; which candidate lane equals the minimum ?
    movmskps eax,xmm1
    ; Only lanes 0 (|a-b|) and 1 (|d-b|) describe interior edges and have
    ; a vertex index stored on the stack. Lanes 2 and 3 hold the height,
    ; they must not take part in the index selection, otherwise a height
    ; that happens to equal the minimum makes bsf pick a saved register
    ; from the stack instead of a vertex index.
    and      eax,11b
    or       eax,eax
    jz       @f                 ; minimum comes from an earlier triangle
;    andps    xmm1,xmm5
    bsf      eax,eax            ; 0 -> vertex a, 1 -> vertex d
    shl      eax,2
    mov      eax,[esp+eax]
    mov      .min_in,eax
  @@:
    add      esp,8              ; drop the two saved indices
    movlps   xmm0,.min_h
    shufps   xmm6,xmm6,00100111b ; bring h into lane 0
    minps    xmm0,xmm6
    movlps   .min_h,xmm0
    loop     .ll
    movlps   xmm1,.min_in
    pop      edi esi edx ebx eax
    mov      esp,ebp
    pop      ebp
ret
;=============================================================
;=============================================================
; "Proc try optimize object by change every triangles geometry
; Try cure situation when triangles are thin and long.."
; This main feature dont exist now..
; Possibility to fill triangle holes in mesh.
opt_object2:
; Mesh clean-up pass working on triangle fans: for every vertex the
; triangles that use it are collected with help of a sorted pivot table
; and the fan is then processed according to the requested operation.
; in:
;    al = 'h'   - hole filling function
;    al = 'm'   - improve geometry of mesh |chges geom but not improve|
;    al = 'e'   - try collapse edges - now this option makes chosen
;                 edges significantly shorter
;    al = 'c'   - remove clenched triangles, clench definition inside code
;    If bl = 'c' then edx = number of the chunk to be affected during
;    proc execution.
; globals read:  the 7 dwords starting at [triangles_normals_ptr]
;                (copied to locals), [chunks_ptr], [f05x3], [sign_mask]
; globals written ('h' mode only): [triangles_ptr], [triangles_count_var]
; uses: malloc / mfree macros, sort_hybrid, sort_hybrid_dd (all defined
;       outside this chunk), check_dist, calc_tri
; destroys: presumably all general purpose and xmm registers except
;           ebp / esp - nothing is saved besides ebp
; frame: ebp is aligned down to 16 and moved 64 bytes lower, so locals
;        are addressed on both sides of ebp; the epilogue therefore
;        restores esp with 'add esp,232' and needs a balanced stack

    push     ebp
    mov      ebp,esp
    and      ebp,-16
    sub      esp,232
    sub      ebp,64

    .tripled_verts_ptr      equ dword [ebp-4]   ; 'c' mode: list of tripled edges
    .curr_tripled           equ dword [ebp-8]   ; write cursor into that list
    .base_v                 equ dword [ebp-12]  ; index of current fan base vertex
    .ratio                  equ dword [ebp-16]  ; shape ratio from calc_tri
    .piv                    equ dword [ebp-24]  ; pivot table (vertex, triangle) pairs
    .piv2                   equ dword [ebp-32]  ; work buffer: sort scratch, edge list
    .max_piv                equ dword [ebp-36]  ; end of the used part of .piv

    .triangles_normals_ptr  equ       [ebp-64]  ; \
    .tch                    equ dword [ebp-60]  ;  \   ; triangles in chunk
    .points_normals_ptr     equ       [ebp-56]  ;  |
    .points_count_var       equ       [ebp-52]  ;  |
    .triangles_count_var    equ       [ebp-48]  ;  |
    .triangles_count_var    equ dword [ebp-48]  ;  |
    .points_r_ptr           equ       [ebp-44]  ;  | > dont change order
    .triangles_ptr          equ       [ebp-40]  ;  /

    .h                      equ       [ebp-68]  ; min fan height; overlaps lane 3 of .Ed_DAl
    .Ed_DBl                 equ       [ebp-72]  ; |d-b|  \
    .Ed_ABl                 equ       [ebp-76]  ; |a-b|   > 16 byte copy of xmm6 from calc_tri
    .Ed_DAl                 equ       [ebp-80]  ; |d-a|  /
    .step                   equ       [ebp-96]  ; displacement vector of one march step
    .old_bas                equ       [ebp-112] ; base vertex position before the step
    .ed_cnt                 equ dword [ebp-116] ; triangles found in the fan
    .glob_h                 equ       [ebp-120]
    .holes_tri_ptr          equ dword [ebp-124] ; 'h': new triangles, 'e': index pairs
    .mark_op                equ byte  [ebp-125] ; operation marker
    .ch                     equ byte  [ebp-126] ; if = 'c' means check chunks mode

    .thresh2                equ       [ebp+16]  ; upper ratio threshold
    .holes_cnt              equ dword [ebp+12]  ; entries stored at .holes_tri_ptr
    .tripled_counter        equ dword [ebp+8]   ; entries stored at .tripled_verts_ptr
    .new_tri_ptr            equ       [ebp+4]   ; 'h' mode: rebuilt triangle list
    .thresh1                equ       [ebp]     ; lower ratio threshold

    .step_fact              equ       [ebp+20]  ; march step as part of the edge length
    .sort_ed                equ       [ebp+24]  ; sorted copy of the fan edge list
    .nx_cnt                 equ dword [ebp+28]  ; vertex indices collected for the fan
    .Ch_no                  equ dword [ebp+32] ; chunk to process number
    .chunks_ptr             equ dword [ebp+36]

    mov      .mark_op,al
    mov      .thresh1, dword  0.67 ; ratio to determine
    mov      .thresh2, dword  1.5  ; if opt process will be launched
    mov      .ch,bl
    mov      esi,[chunks_ptr]
    mov      .Ch_no,edx

    mov      .step_fact,dword 0.05
    xor      ecx,ecx
    mov      .holes_cnt,ecx
    mov      .chunks_ptr,esi
    ; copy 7 consecutive global dwords to the locals at ebp-64 .. ebp-40
    lea      esi,[triangles_normals_ptr]
    lea      edi,.triangles_normals_ptr
    mov      ecx,7
    cld
    rep      movsd
    mov      eax,.triangles_count_var
    mov      .tch,eax
    cmp      .mark_op,'c'
    jne      @f
    ; 'c' mode: buffer for the tripled edges, 8 bytes per entry
    xor      eax,eax
    mov      .tripled_counter,eax
    mov      eax,2000 ; max 2000 tripled verts
;   mov      .max_tripled,eax
    shl      eax,3
    malloc   eax
    mov      .tripled_verts_ptr,eax
    mov      .curr_tripled,eax
  @@:
    cmp      .mark_op,'m'
    je       @f                      ; 'm' mode does not use this buffer
 ;   cmp      .mark_op,'e'
 ;   jne      @f
    mov      eax,5000 * 12 + 200
    malloc   eax
    ; max 5000 filled holes
    mov      .holes_tri_ptr,eax
  @@:
    ; two buffers of (triangles + 15) * 24 bytes: 3 pairs per triangle
    mov      ebx,.triangles_count_var
    add      ebx,15
    imul     ebx,24
    malloc   ebx
    mov      .piv,eax
    add      eax,ebx
    sub      eax,15*24
    ; subtract margin
    mov      .max_piv,eax
    malloc   ebx
    mov      .piv2,eax
    ;*******************
    ; pivot below ******
    mov      esi,.triangles_ptr       ; Making pivot table
    mov      edi,.piv                 ; obligatory element of acceleration structure
    xor      eax,eax                  ; in many procs.
    mov      ecx,.triangles_count_var ; Thing to isolate and transform into separate procedure..
    cld
  .piv_lo:
    ; for each triangle (index in eax) emit 3 pairs: vertex index, eax
    movsd
    stosd
    movsd
    stosd
    movsd
    stosd
    add      ebx,2
    inc      eax
    loop     .piv_lo

    ; sort the pairs; sort_hybrid is defined elsewhere, presumably it
    ; orders them by vertex index using edi as scratch - TODO confirm
    mov      edi,.piv2
    mov      esi,.piv
 ;   movups   xmm2,[esi]
 ;   movups   xmm3,[edi]
    mov      ecx,.tch  ;.triangles_count_var
    lea      ecx,[ecx*3]
    call     sort_hybrid
if 0
    mov      edi,.piv
    mov      esi,.piv3      ; make  list of incidence
    mov      ecx,.triangles_count_var
    lea      ecx,[ecx*3]
    mov      eax,esi
    cld
    stosd
  .tri_lo:                  ; every vert index
    lodsd                   ; matched with triangle desc in piv2 list
    cmp      eax,[esi+4]
    je       @f
    xchg     eax,esi
    add      eax,4
    stosd
    xchg     eax,esi
   @@:
    add      esi,4
    loop     .tri_lo
end if
    ; pivot above **************
    ;***************************
    cld
    mov      esi,.piv
  .nx_ver:
    ; ---- start of the next fan: esi points at a (vertex, triangle) pair
    lodsd
    ; load vert index
    cmp      esi,.max_piv
    jnb      .end
    ; eax - curr vert index
    mov      ebx,eax
    mov      .base_v,eax
    ; eax = base vertex index
    ; first check if base vertex is not boundary..
    ; store edges from tris fan in .piv2
    xor      ecx,ecx
    mov      .ed_cnt,ecx
    mov      .nx_cnt,ecx
    mov      edi,.piv2
  .lab_ed:
    lodsd                   ; eax = triangle index of this pair
    cmp      .ch,'c'  ; check chunks mode ??
    jne      @f
    ; chunk mode: one word per triangle at .chunks_ptr gives its chunk
    mov      edx,eax
    add      edx,edx
    add      edx,.chunks_ptr
    movzx    edx,word[edx]
    cmp      edx,.Ch_no
    jne      .check_edges
  @@:
    inc      ecx
    mov      .ed_cnt,ecx

    ; eax - tri index
    ; push the triangle vertices different from the base vertex, then
    ; pop two of them into the edge list at edi
    ; NOTE(review): a triangle that contains the base vertex twice pushes
    ; only one dword while two are popped - confirm such triangles cannot
    ; reach this point
    imul     eax,12
    add      eax,.triangles_ptr
    mov      edx,[eax]
    cmp      edx,ebx
    je       @f
    inc      .nx_cnt
    ; ebx <-> edx = edge1
    push     edx
  @@:
    mov      edx,[eax+4]
    cmp      edx,ebx
    je       @f
    inc      .nx_cnt
    push     edx
  @@:
    mov      edx,[eax+8]
    cmp      edx,ebx
    je       @f
    inc      .nx_cnt
    push     edx
  @@:
    pop      eax
    stosd
    pop      eax
    stosd
  .nxEd:
    cmp      [esi],ebx
    ; is still the same base index ?
    ; cmp base vert index and next vert index
    jne      .check_edges

    add      esi,4
    jmp      .lab_ed
    ;
    ; keep in mind -> you may get degenerated/false value lower than !!ecx/2!!
    ; if fan/star of triangles is degenerated - ebx base vert
    ; is manifold chunk boundary...
  .check_edges:
    ; As initial step - copy and sort all values from .piv2
    ; If all values are doubled, means OK, tri fan is completed
    ; base ebx vert is not placed on manifold chunk boundary.
    mov      ecx,.ed_cnt
    cmp      ecx,2
    jna      .nx_ver
    mov      ecx,.nx_cnt
    cmp      ecx,4
    jna      .nx_ver

    ; too small - degenerated fan
    ; you may check if all vert indices are paired  =
    ; = if yes, means fan is inside manifold chunk
    ; But i can imagine situation when it is still degenerated
    ; (even more seriously)
    push     esi ebx            ; restored at .en_chck_ed / after clench scan

    push     edi ebx            ; restored at .tidy_stack
    ; copy the edge list behind itself (16 byte gap) and sort the copy
    add      edi,16
    mov      .sort_ed,edi
    mov      esi,.piv2
    mov      ecx,.nx_cnt
    cld
    rep      movsd
    mov      ecx,.nx_cnt
    add      edi,16
    mov      esi,.sort_ed
    call     sort_hybrid_dd

    ; walk the sorted indices; edx counts indices without a partner
    mov      esi,.sort_ed
    mov      ecx,.nx_cnt
    cld
    xor      edx,edx
  .ll_in:
       movlps   xmm0,[esi]    ;
    lodsd
    cmp      .mark_op,'c'
    jne      .no_clench
       ; compare the current index with the two following ones
       movlps   xmm1,[esi]    ;
       shufps   xmm0,xmm0,0   ;
       pcmpeqd  xmm0,xmm1     ;
       movmskps edi,xmm0      ;
       and      edi,11b       ;
       cmp      edi,11b       ;
       ; vertex 'eax' is tripled, contains  ;
       ; clenched tri                       ;
       jne      @f                          ;
       cmp      .tripled_counter,1990       ;
       ja       @f
       mov      ebx,.base_v
       mov      edi,.curr_tripled           ;
       stosd                                ;
       xchg     eax,ebx
       stosd                                ; store whole edge
       xchg     eax,ebx
       inc      .tripled_counter            ;
       mov      .curr_tripled,edi

       ; tris with clench possibility:                ;
       ;  -tripled vert index, every edge single      ;
       ;    not sure if this visable here in tri fan  ;
       ;    ?alone vert indicator in tri fan?         ;
       ;    so .no_inner: cause                       ;
       ;  -one edge tripled, two are single           ;
       ;    this show tripled vert in fan             ;
       ;  -two tripled edges, one single              ;
       ;    this show tr. v. in f.                    ;
       ;  -three tripled edges = tri duplicated       ;
       ;    this show tr. v. in f.                    ;
   @@:
    loop     .ll_in
    add      esp,8              ; drop saved edi ebx
    pop      ebx esi
    jmp      .nx_ver
   .no_clench:
    cmp      [esi],eax
    jne      .no_inner
    ; index is paired: skip its partner as well
    dec      ecx
    add      esi,4
    jecxz    .chck_done
    loop     .ll_in
    jmp      .chck_done
  .no_inner:
    inc      edx  ; non coupled index of vertex
    push     eax  ; keep it, removed again at .tidy_stack
    loop     .ll_in
  .chck_done:
    cmp      edx,2
    jne      .tidy_stack
    cmp      .mark_op,'h'
    jne      .tidy_stack
    cmp      .holes_cnt,4500
    ja       .tidy_stack
    ; !! **** filling holes code below ****
    ; when edx = 2 -> u may search through tri list for hole
    ; such tri should have two indices - u have on stack
    ; 3rd index is NOT equal "ebx"
    movlps   xmm0,[esp]          ; the two unpaired vertex indices

    push     ecx
    xor      ebx,ebx
    mov      ecx,.holes_cnt
    or       ecx,ecx             ;  check if tri already exist
    jz       .no_search          ;  on .holes_tri list
    mov      edi,.holes_tri_ptr
   @@:
    movss    xmm1,.base_v
    movaps   xmm2,xmm0
    movaps   xmm3,xmm0
    shufps   xmm1,xmm1,0
    shufps   xmm2,xmm2,0
    shufps   xmm3,xmm3,01010101b

    ; a stored triangle matches when each of its 3 indices equals
    ; one of: base vertex, first or second unpaired index
    movups   xmm4,[edi]
    movaps   xmm5,xmm4
    movaps   xmm6,xmm4
    pcmpeqd  xmm4,xmm1
    pcmpeqd  xmm5,xmm2
    pcmpeqd  xmm6,xmm3
    orps     xmm4,xmm5
    orps     xmm4,xmm6
    movmskps eax,xmm4
    and      eax,111b
    popcnt   eax,eax
    cmp      eax,3
    cmove    ebx,eax             ; ebx = 3 marks "already on the list"
    add      edi,12
    loop     @b
  .no_search:
    pop      ecx
    cmp      ebx,3
    je       .tidy_stack

    ; append the triangle (unpaired, unpaired, base) to the holes list
    mov      eax,.holes_cnt
    imul     eax,12
    add      eax,.holes_tri_ptr
    movlps   [eax],xmm0    ; fix hole if needed
    mov      ebx,.base_v
    mov      [eax+8],ebx
    inc      .holes_cnt
    ; !! **** filling holes code above ****
  .tidy_stack:
    shl      edx,2
    add      esp,edx             ; remove the pushed unpaired indices
    pop      ebx edi
    or       edx,edx
    jnz      .en_chck_ed         ; open fan (boundary vertex): leave it
    cmp      .mark_op,'e'
    jne      .topology
    mov      ecx,.ed_cnt
    mov      esi,.piv2
    ; in this place:: call to check_dist and find shortest edge to collapse
    imul     ebx,12
    mov      edi,.points_r_ptr
    call     check_dist

    ; move the base vertex to the middle of the shortest interior edge
    movd     eax,xmm1
    imul     eax,12
    mov      edi,.points_r_ptr
    movups   xmm2,[edi+eax]
    movups   xmm3,[edi+ebx]
    addps    xmm3,xmm2
    mulps    xmm3,[f05x3]
    movlps   [edi+ebx],xmm3
    movhlps  xmm3,xmm3
    movss    [edi+ebx+8],xmm3

    mov      ebx,.base_v
    mov      eax,.holes_cnt
    cmp      eax,6000
    ja       @f
    shl      eax,3
    add      eax,.holes_tri_ptr
    ; NOTE(review): edx is always 0 on this path (see the jnz above), so
    ; 0 is recorded as the "index to xchg" - confirm the intended value
    mov      [eax],edx      ; index to xchg
    mov      [eax+4],ebx    ; base index
    inc      .holes_cnt
  @@:
    jmp      .en_chck_ed
    ; jmp to do next tri fan, do research
  .topology:
    mov      ecx,.ed_cnt
  .nx_iter:
;    push     ecx
;    mov      esi,.piv2
;    mov      ecx,.ed_cnt
;    call     check_dist
   ; xmm0 - minimal fan distance (base - edges boundary)
;    movss    .glob_h,xmm0
;    pop      ecx
;    mov      ecx,.ed_cnt
    mov      esi,.piv2
    imul     ebx,12               ; ebx = base vertex offset from here on
    cmp      .mark_op,'h' ; non modify geometry when
    je       .en_chck_ed  ; filling holes feature
  @@:
  .nx_edg:
    dec      ecx
    or       ecx,ecx
    jz       .en_chck_ed
    ; in .piv2 = list of all edges from tri fan/umbrella/
    ; for each pair of edges - check triangle ratio
    lodsd
    xchg     eax,edx
    lodsd
    mov      edi,.points_r_ptr
    push     esi ecx
    mov      esi,.piv2
    mov      ecx,.ed_cnt
    call     check_dist
    ; xmm0 - minimal fan distance (base - edges boundary)
    movss    .h,xmm0
    pop      ecx esi
    push     eax edx ebx
    imul     eax,12
    imul     edx,12
    call     calc_tri          ; curr tri params
    movlps   xmm0,.h
    movaps   .Ed_DAl,xmm6      ; 16 byte store, also overwrites .h ...
    movss    .h,xmm0           ; ... which is therefore written back here
    ; xm6 = lo->hi lengths edx-eax, eax-ebx, edx-ebx, h
    pop      ebx edx eax
    movss    .ratio,xmm7
    shufps   xmm7,xmm7,0
    ; only lane 0 of both compares is evaluated (bt edi,0); the other
    ; lanes compare against the locals stored behind the thresholds
    ; ratio, check threshold variables ;      ebx
                                       ;     /  |  \
    movaps   xmm6,xmm7                 ;    /   |     \
    cmpltps  xmm6,.thresh1             ;   /    | h      \       ; ebx => base tri fan idex
    cmpltps  xmm7,.thresh2             ;  /     |           \    ; eax -- edx => boundary ed
    xorps    xmm7,xmm6                 ; eax---------------- edx ; all vert indices -- clear / divided by 12
    movmskps edi,xmm7
    bt       edi,0
    jnc      .try_modify               ; ratio outside thresh1 .. thresh2
    jmp      @b
  .en_chck_ed:
;   push     ecx
;   mov      esi,.piv2
;   mov      ecx,.ed_cnt
;   call     check_dist
    ; xmm0 - minimal fan distance (base - edges boundary)
;   movss    .glob_h,xmm0
;   pop      ecx
;   comiss   xmm0,.glob_h
;   ja       .nx_iter
    pop      ebx esi
    jmp      .nx_ver    ; jmp to do next tri fan, do research
  .try_modify:          ; xm4 d - b dir vect
    movlps   xmm0,.Ed_ABl
    movlps   xmm2,.Ed_DBl
    comiss   xmm2,xmm0  ; xm0 len= a-b, d-b, .. xm2= d-b
    ja       @f
    ; search for longest ed from a-b, d-b
    movaps   xmm2,xmm0
    xchg     eax,edx
    movaps   xmm5,xmm4
  @@:
    ; xm5 = dir vect of longest from lo->hi: b-d, b-a edges
    ; decrease ebx <--> edx edge len
    ; xm5 = direction vect of edx<-->ebx longest ed
    ; xm2 = lowest float - longest ed length
    movss    .Ed_DBl,xmm2
    movlps   xmm6,.step_fact
    shufps   xmm6,xmm6,0
    mulps    xmm6,xmm5
    ; xm6  = direction step, multiply it by len of d-b edge
    shufps   xmm2,xmm2,0
    mulps    xmm6,xmm2
    movaps   .step,xmm6
    mov      edi,.points_r_ptr
    imul     edx,12
    imul     eax,12
    push     ecx
    mov      ecx,8                ; at most 8 march steps
  .march:
    movups   xmm1,[edi+ebx]
    movaps   .old_bas,xmm1
    ; go along edx ebx edge
    addps    xmm1,.step
    movhlps  xmm7,xmm1
    movlps   [edi+ebx],xmm1
    movss    [edi+ebx+8],xmm7
    ; check if len decreased
    movups   xmm4,[edi+edx]
    subps    xmm4,xmm1
    dpps     xmm4,xmm4,01110111b
    sqrtps   xmm4,xmm4
    comiss   xmm4,.Ed_DBl
    jb       @f
    ; edge did not get shorter: reverse the step direction
    movaps   xmm6,.step
    xorps    xmm6,[sign_mask]
    addps    xmm1,xmm6
    movaps   .step,xmm6
  @@:
    movhlps  xmm7,xmm1
    movlps   [edi+ebx],xmm1
    movss    [edi+ebx+8],xmm7
    ; check h = height of triangle = min distance to edge boundary
    push     esi ecx
    mov      esi,.piv2
    mov      ecx,.ed_cnt
    call     check_dist
    pop      ecx esi
    movlps   xmm5,.h
    ; movaps   .Ed_DAl,xmm6
    ; shufps   xmm0,xmm0,11111111b
    movss    .h,xmm0
    comiss   xmm0,xmm5
    ; check if distance to boundary edge (.h) decreased
    ; if yes - abandon changes
    ; as routine result ->
    ;  -> BD edge len. should decreased, .h should increased
    jna      @f   ;.march
    dec      ecx
    jnz      .march
  ;  loop     .march
    pop      ecx
    jmp      .nx_edg
  @@:
    ; abandon changes
    movaps   xmm1,.old_bas
    movhlps  xmm0,xmm1
    movlps   [edi+ebx],xmm1
    movss    [edi+ebx+8],xmm0
    pop      ecx
    jmp      .nx_edg
  .end:
 ;   cmp      .mark_op,'e'
 ;   jne      .en3
    ;
    ; aggregate changes using info .holes_tri_ptr
    ; In this list first dword = index to xchg with sec dword index
  .en3:
    ; 'h' mode: build a new triangle list = hole triangles + old list
    cmp      .mark_op,'h'
    jne      .en2
    mov      ecx,.holes_cnt
    or       ecx,ecx
    jz       .en2
    mov      ebx,.triangles_count_var
    add      ebx,ecx
    push     ecx
    add      ebx,100
    imul     ebx,12
    malloc   ebx
    mov      .new_tri_ptr,eax
    pop      ecx
    mov      edi,eax
    mov      esi,.holes_tri_ptr
    cld
    lea      ecx,[ecx*3]
    rep      movsd
    mov      esi,.triangles_ptr
    mov      ecx,.triangles_count_var
    lea      ecx,[ecx*3]
    rep      movsd
    ; I hope, some holes are filled
    ; tri list was just updated
    mov      ecx,.holes_cnt
    add      ecx,.triangles_count_var
    mov      [triangles_count_var],ecx
    mfree    [triangles_ptr]
    push     dword .new_tri_ptr
    pop      [triangles_ptr]
  .en2:
    ; *************************
    ; *************************
    ; remove clenched triangles
    ; *************************
    ; *************************
    cmp      .mark_op,'c'           ; todo:
    jne      .no_rc                 ;  - maybye non brute force way?
    xor      eax,eax                ;  - condition to leave some tris intact
    cmp      .tripled_counter,eax   ;
    ; nothing recorded: skip the search, but still release the buffer
    ; that is always allocated in 'c' mode
    jna      .end_rem_clenched
    cld
    mov      esi,.tripled_verts_ptr
    mov      ecx,.tripled_counter
  .nx_tripled_ed:
    push     ecx
    movlps   xmm0,[esi]  ; esi = two verts indices equals tripled edge
    movaps   xmm1,xmm0
    shufps   xmm0,xmm0,0
    shufps   xmm1,xmm1,01010101b
    add      esi,8
    push     esi
    mov      esi,.triangles_ptr
    mov      ecx,.triangles_count_var
  .lrm:
    ; look for a triangle containing both vertices of the tripled edge
    push     ecx
    movaps   xmm2,xmm0
    mov      edi,esi         ; edi = base tri
    movaps   xmm3,xmm1
    movups   xmm4,[esi]
    pcmpeqd  xmm2,xmm4
    add      esi,12
    push     esi
    pcmpeqd  xmm3,xmm4
    orps     xmm2,xmm3
    movmskps eax,xmm2
    and      eax,111b
    popcnt   ebx,eax
    cmp      ebx,2
    je       .ser_others
  .nx_lrma:
    pop      esi
    pop      ecx
    loop     .lrm
  .nx_t_edge:
    pop      esi
    pop      ecx
    loop     .nx_tripled_ed
    jmp      .end_rem_clenched
  .ser_others:
    ; search for other edges
    ; if are 1. all 3 tripled
    ;        2. both other two single
    ;        3. one other single
    ; first isolate the third vertex of the triangle: the lowest clear
    ; bit of the match mask gives its position
    movaps   xmm5,xmm4
    not      eax
    bsf      ecx,eax
    jecxz    .shifted
  @@:
    psrldq   xmm5,4
    loop     @b

  .shifted:
    movaps   xmm6,xmm5

    shufps   xmm6,xmm6,0        ; xmm6 = third vertex in every lane


    ; count triangles owning each of the two other edges:
    ; dh = edge (first vertex, third), dl = edge (second vertex, third)
    xor      edx,edx
    mov      esi,.triangles_ptr
    mov      ecx,.triangles_count_var
  .l_other:
    movaps   xmm2,xmm0

    movaps   xmm4,xmm1
    movaps   xmm3,xmm6

    movups   xmm7,[esi]
    add      esi,12
    pcmpeqd  xmm2,xmm7
    pcmpeqd  xmm3,xmm7
    pcmpeqd  xmm4,xmm7
    orps     xmm2,xmm3
    movmskps eax,xmm2
    and      eax,111b
    popcnt   eax,eax
    cmp      eax,2
    jne      @f
    inc      dh  ; inc edge incidence counter
                 ; => I found other edge
  @@:
    orps     xmm4,xmm3
    movmskps eax,xmm4
    and      eax,111b
    popcnt   eax,eax
    cmp      eax,2
    jne      @f
    inc      dl
  @@:
    loop     .l_other
    cmp      dx,0x0303
    ; two other edges tripled or have more copies
    jae      @f
    cmp      dl,1 ; single edge
    je       @f
    cmp      dh,1 ;
    jne      .nx_lrma
  @@:
    ; two other edges are single => mark tri to rem
    ; all three indices of the triangle at edi are overwritten with
    ; points count + 2
    mov      eax,.points_count_var
    add      eax,2
    ; 'safe' value
    cld
    stosd
    stosd
    stosd
    ; time to next tripled edge
    add      esp,8              ; drop esi / ecx saved by .lrm
    jmp      .nx_t_edge
    ; find next other edges
  .end_rem_clenched:
    mfree    .tripled_verts_ptr
  .no_rc:
    ; ......

    ; the 'm' mode never allocates .holes_tri_ptr (see the allocation
    ; above), the slot is uninitialised then and must not be freed
    cmp      .mark_op,'m'
    je       .no_holes_buf
    mfree    .holes_tri_ptr
  .no_holes_buf:
    mfree    .piv
    mfree    .piv2
    add      esp,232
    pop      ebp
ret
;============================================================
from_tex:
     ; converts texture bitmap to 3d object
     ; bit otherwise than pixel<=>voxel
     ; I tried decrease elements number
     ;
     ; Every run of non transparent texels is turned into one box of
     ; 8 vertices (4 front + 4 shifted in z) and 4 triangles.
     ; in:  nothing in registers; reads texmap, [tolerancy_flag],
     ;      [disp_col_flag] and the constants TEXTURE_SIZE, TEX_X,
     ;      TEX_SHIFT (all defined outside this chunk)
     ; out: on success [points_r_ptr], [triangles_ptr],
     ;      [points_count_var] and [triangles_count_var] describe the
     ;      new object and the old two buffers are released with mfree;
     ;      otherwise the globals are left untouched
     ; side effect: the highest byte of every texmap dword is cleared
     ; destroys: eax ebx ecx edx esi and xmm registers; edi is restored
     ;      only around the texel counting part, so presumably it is
     ;      destroyed as well - TODO confirm against callers

     push        ebp
     mov         ebp,esp
     sub         esp,44
     .verts_ptr  equ dword[ebp-4]     ; new vertex buffer
     .tris_ptr   equ dword[ebp-8]     ; new triangle buffer
     .tris_cnt   equ dword[ebp-12]
     .verts_cnt  equ dword[ebp-16]
     .curr_v_ptr equ dword[ebp-20]
     .curr_t_ptr equ dword[ebp-24]
     .x          equ      [ebp-26]    ; current texel column (word)
     .y          equ word [ebp-28]    ; current texel row
     .x2         equ word [ebp-30]    ; width of the current run
     .y2         equ word [ebp-32]    ; height of the current run
     .texx4      equ dword[ebp-36]    ; TEX_X * 4 = bytes per texel row
     .zzz        equ dword[ebp-40]    ; box depth = 2 * tolerancy_flag
     .col        equ      [ebp-44]    ; colour treated as transparent

     movzx      eax,[tolerancy_flag]
     add        eax,eax
     mov        .zzz,eax
     push       edi
     mov        ecx,TEXTURE_SIZE
     xor        eax,eax
     mov        ebx,0x00ffffff
    ; setting transparent col
    ; according this flag
     cmp        [disp_col_flag],1
     cmovne     eax,ebx
     mov        .col,eax
     mov        edi,texmap
     cld
     push       edi ecx
   @@:
     ; keep only the low 24 bits of every texel
     and        dword[edi],0x00ffffff
     add        edi,4
     loop       @b
     pop        ecx edi
     xor        ebx,ebx
   @@:
     ; ebx = number of texel runs that follow a run of transparent texels
     ; (eax still holds the transparent colour)
     repe       scasd
     or         ecx,ecx
     jz         @f
     inc        ebx
     repne      scasd
     or         ecx,ecx
     jnz        @b
   @@:
     pop        edi
     cmp        ebx,60000     ; is future object not too complex ??
     ja         .end
     mov        eax,TEX_X     ; max voxels size 512*512*12 * 4
     shl        eax,2
     mov        .texx4,eax
     imul       eax,eax
     shl        eax,2
;     add        eax,.texx4
;     add        eax,100
     push       eax   ;8 :: box
     malloc     eax           ; vertex buffer: (TEX_X*4)^2 * 4 bytes
     mov        .verts_ptr,eax
     mov        .curr_v_ptr,eax
     pop        eax
     shl        eax,1
     malloc     eax           ; triangle buffer: twice that size
     mov        .tris_ptr,eax
     mov        .curr_t_ptr,eax
     mov        esi,texmap            ; search for any transparent pix
     mov        ecx,TEXTURE_SIZE
     cld
   .b11:
     lodsd
     cmp         eax,.col
     jz         .f6
     loop       .b11
     jmp        .end1                 ; none found: release buffers, quit
   .f6:
     xor        eax,eax
     mov        .tris_cnt,eax
     mov        .verts_cnt,eax
     movlps     xmm1,.col
     shufps     xmm1,xmm1,0
     mov        .y,ax
     mov        esi,.verts_ptr        ; esi = vertex write cursor
   .loopy:
     xor        eax,eax
     mov        .x,ax
   .loopx:
     ; edi = address of texel (x, y)
     movzx      eax,word .x
     movzx      edx,.y
     mov        .x2,1                 ; default advance when texel is skipped
     shl        edx,TEX_SHIFT
     add        edx,eax
     shl        edx,2
     mov        ebx,.col
     lea        edi,[texmap+edx]
     cmp        ebx,[edi]
     je         .skip                 ; transparent texel makes no geometry
   .fi_x2:
     xor        ebx,ebx
     ;=== find x2 to simplify obj
     ; count texels (ebx) up to and including the first transparent one,
     ; at most TEX_X - x of them
     mov        ecx,TEX_X
     sub        cx,.x
     push       esi
     xor        ebx,ebx
     mov        esi,edi
   @@:
     lodsd
     inc        ebx
     cmp        eax,.col
     jz         .f4
     loop       @b
  .f4:
     pop        esi
     mov        .x2,bx
     shl        ebx,2
     xor        eax,eax
     ; NOTE(review): for x = 0 the y search is bypassed and .y2 is not
     ; reset here, so the 'dec .y2' below works on the value left by an
     ; earlier run - confirm this is intended
     cmp        .x,ax
     je         .no_y_chck
     mov        .y2,ax
     ;=== find y2 to simplify obj
     sub        edi,.texx4            ; start one row above
     bts        ecx,0                 ; bit 0 of ecx = "first comparison" flag
   .cmp_y:
     ; compare the texel pair (x-1, x) of one row with the row below;
     ; the texel bit patterns are compared as floats by cmpeqps
     movlps     xmm0,[edi-4]
     add        edi,.texx4
     movlps     xmm1,[edi-4]
     inc        .y2
     cmpeqps    xmm0,xmm1
     movmskps   eax,xmm0
     and        al,11b
     btr        ecx,0
     jnc        .f133
     ; first comparison = row above against the current row:
     ; identical pair -> no box is emitted for this run
     cmp        al,11b
     je         .skip  ;.no_y_chck
     jmp        .cmp_y
   .f133:
     ; following comparisons: extend downwards while the rows are equal
     cmp        al,11b
     je         .cmp_y
   .no_y_chck:
     dec        .y2
     xorps      xmm7,xmm7
     movzx      ebx,.x2
     cvtsi2ss   xmm7,ebx              ; xmm7 lane 0 = x2 as float
     xorps      xmm5,xmm5
     movzx      ebx,.y2
     cvtsi2ss   xmm5,ebx              ; xmm5 lane 0 = y2 as float
     ; NOTE(review): this 8 byte load starts at .x (ebp-26) and therefore
     ; also reads .curr_t_ptr and half of .curr_v_ptr, while .y lies below
     ; .x and is not part of it; the words are then interleaved with xmm5,
     ; which holds the float y2 rather than zero - confirm that the x/y
     ; packing produced here is the intended one
     movlps     xmm2,.x
     punpcklwd  xmm2,xmm5
     cvtdq2ps   xmm2,xmm2

     ; each store writes 16 bytes into a 12 byte vertex slot, the extra
     ; lane is overwritten by the next store
     movups     [esi],xmm2     ; first vertex         1-----2
     addps      xmm2,xmm7      ; x+x2                 |    /|
                               ;                      |   / |
     movups     [esi+12],xmm2  ;                      |  /  |
     pslldq     xmm5,4         ;                      | /   |
     addps      xmm2,xmm5      ; y+y2                 4-----3
     movups     [esi+24],xmm2  ; x+x2  y+y2
     subps      xmm2,xmm7      ; x-x2
     movups     [esi+36],xmm2  ; x-x2  y+y2
     ; increase z
     xorps      xmm5,xmm5
     ; z coord ->  aprox flag dependent
     cvtsi2ss   xmm5,.zzz
     pslldq     xmm5,8         ; move the depth into the z lane
     mov        ecx,4
   .ll:
     ; second face: copy of the 4 vertices shifted by .zzz along z
     movups     xmm4,[esi]
     addps      xmm4,xmm5      ; z + .zzz
     movups     [esi+48],xmm4
     add        esi,12
     loop       .ll
     add        .tris_cnt,4
     add        .verts_cnt,8
     add        esi,48         ; skip the second face: cursor after 8 vertices
   .skip:
     mov        ax,.x2
     add        .x,ax
     mov        dx,TEX_X - 1
     cmp        .x,dx
     jle        .loopx
     inc        .y
     cmp        .y,dx
     jne        .loopy
     ; do tri job        - I calculate only 4 tris
     ;                     in fact cube is 12 tris !!
     ; every pass consumes 4 consecutive vertices (eax = first of them)
     ; and writes the triangles (v, v+1, v+3) and (v+1, v+2, v+3)
     mov        edi,.tris_ptr
     xor        eax,eax
     mov        ecx,.tris_cnt
     shr        ecx,1
     cld
  .tril:
     push       ecx
     mov        edx,edi   ; curr ptr
     stosd                ; 1
     inc        eax
     stosd                ; 2
     mov        ebx,eax
     add        ebx,2
     xchg       eax,ebx
     stosd                ; 4
     ; 1st tri ready
     xchg       eax,ebx
     stosd                ; 2
     inc        eax
     stosd                ; 3
     inc        eax
     stosd                ; 4
     ; sec tri ready
     inc        eax
     ; 2 next tris
     pop        ecx
     loop       .tril
     xor        eax,eax
     cmp       .verts_cnt,eax
     je        .end1
     ; replace the current object by the generated one
     mfree     [triangles_ptr]
     mfree     [points_r_ptr]
     push      .verts_ptr
     push      .tris_ptr
     pop       [triangles_ptr]
     pop       [points_r_ptr]
     push      .verts_cnt
     pop       [points_count_var]
     push      .tris_cnt
     pop       [triangles_count_var]
     jmp       .end
   .end1:
     ; nothing generated: release both new buffers
     mfree     .verts_ptr
     mfree     .tris_ptr
   .end:
     mov        esp,ebp
     pop        ebp
ret
;=======================================================================
do_deformation_normals_depend:
 ; Deformation according to the /next/ object and the tolerancy flag.
 ;
 ; The last NextTrianglesCount triangles of the object are drawn by
 ; glass_tex_tri into a 1000 x 1000 map of normal vectors (with a
 ; Z buffer).  Then each of the first
 ; (points_count - NextPointsCount) vertices is moved along the normal
 ; stored under its own screen position, scaled by
 ; 33 * [tolerancy_flag], and the result is handed to rotary, which
 ; writes into the buffer at [points_r_ptr].
 ; in:
 ;    globals only: six dwords starting at [points_count_var],
 ;    [NextTrianglesCount], [NextPointsCount], [scale], [xxadd],
 ;    [tolerancy_flag], matrix_scaled
 ; out:
 ;    vertices at [points_r_ptr] rewritten by rotary
 ;    nothing is done when either 'next' count is 0 or -1
 ; changes -> general purpose and xmm registers (only ebp is saved)
    push   ebp
    mov    ebp,esp
    and    ebp,-16           ; 16-byte aligned frame base for movaps
    sub    esp,200
    sub    ebp,32

    .scr_normals_ptr            equ dword[ebp-4]
    .points_rot                 equ dword[ebp-8]
    .n_line_proc                equ      [ebp-12]
    .Z_ptr                      equ dword[ebp-16]
    .B_scale                    equ      [ebp-32]
    .x_max                      equ dword[ebp-36]
    .x_min                      equ dword[ebp-40]
    .y_max                      equ dword[ebp-44]
    .y_min                      equ      [ebp-48]
    .trans                      equ      [ebp-64]
    .mxx                        equ      [ebp-100]
    .NextTrianglesCount         equ dword[ebp-108]
    .NextPointsCount            equ dword[ebp-112]

    ; the six dwords below are filled by one rep movsd - keep the order
    .points_count_var           equ dword[ebp]
    .triangles_count_var        equ dword[ebp+4]
    .points_r_ptr               equ dword[ebp+8]
    .triangles_ptr              equ dword[ebp+12]
    .points_rotated_ptr         equ dword[ebp+16]
    .points_normals_rotated_ptr equ dword[ebp+20]

     cld
     lea       esi,[points_count_var]     ; globals to locals to
     lea       edi,.points_count_var      ; make shorter addresses
     mov       ecx,6
     rep       movsd
     push      [NextTrianglesCount]
     pop       .NextTrianglesCount
     push      [NextPointsCount]
     pop       .NextPointsCount
     ; leave when any of the 'next' counts is 0 or -1
     or        eax,-1
     xor       esi,esi
     xor       ebx,ebx
     cmp       .NextTrianglesCount,esi
     cmove     ebx,eax
     cmp       .NextPointsCount,esi
     cmove     ebx,eax
     cmp       .NextTrianglesCount,eax
     cmove     ebx,eax
     cmp       .NextPointsCount,eax
     cmove     ebx,eax
     or        ebx,ebx
     jnz       .end
     ; clip rectangle 0..1000 in both directions; the four dwords at
     ; .y_min are later loaded as one pack into xmm5
     mov       ebx,1000
     mov       .n_line_proc,dword normal_line ;do_deformation_normals_depend
     mov       .y_min,esi
     mov       .x_min,esi
     mov       .y_max,ebx
     mov       .x_max,ebx

    ; .points_rot = (rotated vertex - [xxadd]) * 1/[scale]
    ; (rcpps gives an approximated reciprocal)
    mov       eax,.points_count_var
    push      eax
    imul      eax,12
    add       eax,1000
    malloc    eax
    mov       .points_rot,eax
    pop       ecx
    mov       esi,.points_rotated_ptr
    movss     xmm0,[scale]
    shufps    xmm0,xmm0,0
    rcpps     xmm0,xmm0
    cvtdq2ps  xmm1,[xxadd]
    mov       eax,.points_rot
  @@:
    movups    xmm2,[esi]      ; 16 bytes moved per 12 byte vertex, the
    subps     xmm2,xmm1       ; spare dword is overwritten by the next
    mulps     xmm2,xmm0       ; pass or lands in the +1000 margin
    movups    [eax],xmm2
    add       eax,12
    add       esi,12
    loop      @b

    mov       eax,480
    cvtsi2ss  xmm2,eax
    shufps    xmm2,xmm2,0
    movaps    .trans,xmm2     ; translation added after scaling
    mov       eax,960
    cvtsi2ss  xmm1,eax
    shufps    xmm1,xmm1,0
    movaps    .B_scale,xmm1   ; scale applied to .points_rot vertices
    mov       eax,1000 * 1000 * 12 + 1000
    malloc    eax
    mov       .scr_normals_ptr,eax   ; screen here i will store normal vectors
    mov       eax,1050 * 1000 * 4 + 1000
    malloc    eax
    mov       .Z_ptr,eax         ; Z buffer, filled with 60000.1
    mov       edi,eax
    mov       ecx,1000 * 1000
    mov       eax,60000.1
    cld
    rep       stosd
    ; esi -> first of the last .NextTrianglesCount triangles
    mov       eax,.triangles_count_var
    mov       ecx,.NextTrianglesCount
    sub       eax,ecx
    mov       esi,eax
    imul      esi,12
    add       esi,.triangles_ptr
  .again_stencil_tri:
  @@:
    cld
    push      ecx
    lodsd
    xchg      ecx,eax
    lodsd
    xchg      ebx,eax    ; shorter than mov  ebx,eax
    lodsd
    xchg      ecx,eax    ; eax, ebx, ecx = 1st, 2nd, 3rd vertex index
    push      esi
    ; scale should be - 1000 X 1000
    imul      eax,12
    imul      ebx,12
    imul      ecx,12
    mov       edx,.points_rot
    movss     xmm3,[edx+eax+8]
    movss     xmm4,[edx+ebx+8]
    movss     xmm5,[edx+ecx+8]
    movups    xmm0,[eax+edx]
    movups    xmm1,[ebx+edx]
    movups    xmm2,[ecx+edx]
    shufps    xmm4,xmm4,11110011b
    orps      xmm3,xmm4
    movlhps   xmm3,xmm5     ; xmm3 - z float pack
    ; U may scale according to whole 'next' part of object
    mulps     xmm0,.B_scale
    mulps     xmm1,.B_scale
    mulps     xmm2,.B_scale
    addps     xmm0,.trans
    addps     xmm1,.trans
    addps     xmm2,.trans
    cvtps2dq  xmm0,xmm0
    cvtps2dq  xmm1,xmm1
    cvtps2dq  xmm2,xmm2
    packssdw  xmm0,xmm0     ; x, y as two words in the lowest dword
    packssdw  xmm1,xmm1
    packssdw  xmm2,xmm2
    punpckldq xmm0,xmm1
    sub       esp,12        ; pass the three packed coords via stack
    movlps    [esp],xmm0
    movss     [esp+8],xmm2
    mov       edx,.points_normals_rotated_ptr
    movups    xmm0,[eax+edx]   ; normals of the three vertices
    movups    xmm1,[ebx+edx]
    movups    xmm2,[ecx+edx]
    pop       eax ebx ecx      ; packed screen coords of the vertices
    ror       eax,16           ; swap the two word halves
    ror       ebx,16
    ror       ecx,16
    movaps    xmm4,xmm3
    movups    xmm5,.y_min      ; y_min, y_max, x_min, x_max
    movlps    xmm7,.n_line_proc ; line proc address + .points_rot dword
    mov       edi,.scr_normals_ptr ; screen
    mov       esi,.Z_ptr
    ; register set-up above follows glass_tex_tri, which is defined
    ; elsewhere - see that procedure for the exact contract
    pushad
    call      glass_tex_tri
    popad
    pop       esi
    pop       ecx
 ;   add       esi,12
 ;   loop     @b
    dec       ecx
    jnz       .again_stencil_tri
    ; TRANSFORM - DEFORMATE ACCORDING TO NORMAL MAP
    mov      eax,33       ; factor
    movzx    ecx,[tolerancy_flag]
    cvtsi2ss xmm3,eax
    cvtsi2ss xmm5,ecx
    mulps    xmm3,xmm5
    shufps   xmm3,xmm3,0  ; xmm3 = 33 * tolerancy_flag, broadcasted

    mov      ecx,.points_count_var
    sub      ecx,.NextPointsCount
    ; 'loop' with ecx = 0 would make 2^32 passes (ecx is decremented
    ; before the test) and a negative count nearly as many, so leave
    ; when there is no vertex outside the 'next' part
    jle      .no_deform
    push     ecx          ; count is needed again for rotary
    mov      esi,.points_rot
    mov      eax,995
    cvtsi2ss xmm6,eax
    shufps   xmm6,xmm6,0  ; upper clamp of screen coords
    xorps    xmm5,xmm5    ; lower clamp of screen coords
  .bb:
    movups   xmm0,[esi]
    mulps    xmm0,.B_scale
    movaps   xmm7,xmm0    ; xmm7 - scaled vertex, no translation
    addps    xmm0,.trans
    minps    xmm0,xmm6
    maxps    xmm0,xmm5
    cvtps2dq xmm4,xmm0
    sub      esp,8
    movlps   [esp],xmm4
    pop      eax          ; x
    pop      ebx          ; y
    imul     ebx,1000
    add      eax,ebx
    imul     eax,12       ; (y * 1000 + x) * 12 - offset in normal map
    add      eax,.scr_normals_ptr
    movups   xmm2,[eax]    ; normal vector
    mulps    xmm2,xmm3
    addps    xmm7,xmm2
    movlps   [esi],xmm7   ; store back only 12 bytes
    movhlps  xmm7,xmm7
    movss    [esi+8],xmm7
    add      esi,12
    loop     .bb

    mfree    .scr_normals_ptr
    mov      esi,matrix_scaled
    lea      edi,.mxx
    call     reverse_mx_3x3
  ; keep in mind in .points_rot vertices no translated
    pop      ecx          ; count of deformed vertices
    mov      eax,ecx
    mov      esi,.points_rot
    mov      edi,.points_r_ptr
    lea      ebx,.mxx
    call     rotary
    jmp      .free_work
   .no_deform:
    mfree    .scr_normals_ptr
   .free_work:
    mfree    .points_rot
    mfree    .Z_ptr
   .end:
    add      esp,200
    pop      ebp
ret
;=======================================================================
rm_inner_faces:
;  remove inside faces, whole face must reside
;  in closed volume of object
;
;  Allocates four sets of work buffers in thread_params, runs
;  rm_inner_faces_th through call_thread with a thread count of 4,
;  merges the four per-thread visibility masks and rebuilds the
;  triangle list out of triangles marked as visible.
; in:
;    globals only: four dwords starting at [points_count_var]
; out:
;    [triangles_ptr], [triangles_count_var] replaced when at least one
;    triangle is marked visible, otherwise left as they were
; changes -> general purpose registers (only ebp is saved)
   push    ebp
   mov     ebp,esp
   sub     esp,36
   .trii_count          equ dword[ebp-4]
   .trii                equ dword[ebp-8]
   .mem_start           equ dword[ebp-12]
   .thread_params       equ dword[ebp-16]
   ; four dwords below are filled by one rep movsd - keep the order
   .points_count_var    equ dword[ebp-32]
   .triangles_count_var equ dword[ebp-28]
   .points_r_ptr        equ dword[ebp-24]
   .triangles_ptr       equ dword[ebp-20]
   .value               equ dword[ebp-36]
   prompt  prompt_inside_fac
   cld
   lea      esi,[points_count_var]
   lea      edi,.points_count_var
   mov      ecx,4
   rep      movsd
   mov      .thread_params,thread_params
   mov      .value,1050*1050*4+65536
   xor       ebx,ebx
 ; one 16 byte record per thread: z buffer, project buffer,
 ; triangles mask (one byte per triangle), rotated vertices buffer
 .alloc:
   malloc   .value
   mov      edx,.thread_params   ; edx is reloaded after every malloc
   mov      [edx+ebx],eax
   malloc   .value
   mov      edx,.thread_params
   mov      [edx+ebx+4],eax
   mov      eax,.triangles_count_var
   add      eax,65536
   malloc   eax
   mov      edx,.thread_params
   mov      [edx+ebx+8],eax
   mov      eax,.points_count_var
   imul     eax,12
   add      eax,30
   malloc   eax
   mov      edx,.thread_params
   mov      [edx+ebx+12],eax
   add      ebx,16
   cmp      ebx,16*4
   jne      .alloc


    mov        eax,rm_inner_faces_th
    xor        edx,edx
    mov        ecx,4
    call       call_thread
    mov        ebx,16      ; merge visible triangles list
 .nx_tri_p:                ; next tri list params
    ; copy every 1 from mask of thread ebx/16 into mask of thread 0
    mov        edx,.thread_params
    mov        edi,[edx+8]
    mov        esi,[edx+8+ebx]
    mov        ecx,.triangles_count_var
  .mv:
    cmp        byte[esi],1
    jne        @f
    mov        al,[esi]
    mov        [edi],al
   @@:
    inc        esi
    inc        edi
    loop       .mv
    add        ebx,16
    cmp        ebx,64
    jne        .nx_tri_p
    mov        edx,.thread_params
    mov        esi,[edx+8]    ; count visible tris
    mov        ecx,.triangles_count_var
    xor        ebx,ebx
   .cnt:
    cmp        byte[esi],1
    jne        @f
    inc        ebx
   @@:
    inc        esi
    loop       .cnt
    or         ebx,ebx
    jz         .free_all      ; nothing marked - keep the old list
    mov        .trii_count,ebx
    imul       ebx,12
    add        ebx,1024
    malloc     ebx
    mov        .trii,eax
    mov        edx,.thread_params
    mov        ebx,[edx+8] ; renew triangles list
    mov        esi,.triangles_ptr
    mov        edi,.trii
    mov        ecx,.triangles_count_var
  .move:
    ; ebx -> mask byte, esi -> old triangle, edi -> new list end
    cmp        byte[ebx],1
    jne        @f
    movsd                     ; marked - copy three vertex indices
    movsd
    movsd
    inc         ebx
    loop       .move
    jmp        .enm
   @@:
    add        esi,12         ; not marked - skip the triangle
    inc        ebx
    loop       .move
  .enm:
    mfree      [triangles_ptr]
    push       .trii
    pop        [triangles_ptr]
    push       .trii_count
    pop        [triangles_count_var]
  .free_all:
    ; free all 16 pointers stored in thread_params
    xor        ebx,ebx
  @@:
    mov        edx,.thread_params
    mfree      [edx+ebx]
    add        ebx,4
    cmp        ebx,64
    jne        @b
    cls
    mov        esp,ebp
    pop        ebp
ret
;======================================================================
rm_inner_faces_th:
;  Remove inside faces.
;  The object is rotated through a range of angles; for every
;  orientation do_stencil renders triangle indices into .prj_buff and
;  every index found there is marked with 1 in .tri_buff.
; in:
;  ebx == -1 -> leave only front, otherwise ->
;   -> ebx = thread No
;  Vertices to project are always taken from the buffer at
;  [points_r_ptr]; edi on entry is not used.
;  in threaded cause - record No ebx, 16 bytes each, record 0 at:
;   dword[thread_params]    -  .z_buff     - zbuffer
;   dword[thread_params+4]  -  .prj_buff   - project buffer
;   dword[thread_params+8]  -  .tri_buff   - triangles buffer
;   dword[thread_params+12] -  .points_rot - vertices rotated buffer
; out:
;  eax = .tri_buff
;  threaded cause: byte[.tri_buff + n] = 1 when triangle n was seen,
;   buffers stay allocated - owner of thread_params frees them
;  ebx == -1: [triangles_ptr], [triangles_count_var] replaced by list
;   of seen triangles (if any); all four private buffers are freed,
;   so eax points to freed memory and must not be dereferenced
; changes -> general purpose and xmm registers (only ebp is saved)

    push        ebp
    mov         ebp,esp
    and         ebp,-16
    sub         ebp,128+16   ; to increase shorter adresses amount
    sub         esp,292

;  .z_buff      equ dword[ebp+4]
;  .prj_buff    equ dword[ebp+8]
   .tri_buff    equ dword[ebp+12]
   .side0       equ dword[ebp+16]
   .tri_count   equ dword[ebp+20]
   .tri         equ dword[ebp+24]
   .sinx        equ dword[ebp+28]
   .cosx        equ dword[ebp+32]
   .siny        equ dword[ebp+36]
   .cosy        equ dword[ebp+40]
   .sinz        equ dword[ebp+44]
   .cosz        equ dword[ebp+48]
   .points_rot  equ dword[ebp+52]
   .side1       equ dword[ebp+56]
   .side2       equ dword[ebp+60]
   .matrix      equ      [ebp+64]
   .tri_buffe   equ      [ebp+100]
   .only_front  equ dword[ebp+104]
   .vert_to_prj equ dword[ebp+108]
   .start0      equ dword[ebp+112]
   .start1      equ dword[ebp+116]
   .start2      equ dword[ebp+120]
   .end0        equ dword[ebp]
   .end1        equ dword[ebp-4]
   .end2        equ dword[ebp-8]
   .step        equ dword[ebp-12]
  ; .i12        equ dword[ebp-16]
  ; .f500x3     equ      [ebp-32]
   .z_buff      equ dword[ebp-36]  ; \
   .prj_buff    equ dword[ebp-40]  ;  >  dont change
   .flat_line2  equ dword[ebp-44]  ;  >  order
   .shl1000     equ      [ebp-48]  ; /   (loaded as one pack to xmm1)
   .mln         equ      [ebp-52]
   ; six dwords below are filled by one rep movsd - keep the order
   .points_count_var           equ dword[ebp-76]
   .triangles_count_var        equ dword[ebp-72]
   .points_r_ptr               equ dword[ebp-68]
   .triangles_ptr              equ dword[ebp-64]
   .points_rotated_ptr         equ dword[ebp-60]
   .points_normals_rotated_ptr equ dword[ebp-56]

   ; four dwords below are loaded as one pack to xmm5
   .x_max                      equ dword[ebp-80]
   .x_min                      equ dword[ebp-84]
   .y_max                      equ dword[ebp-88]
   .y_min                      equ      [ebp-92]

   cld
   lea     esi,[points_count_var]
   lea     edi,.points_count_var
   mov     ecx,6
   rep     movsd
   mov     eax,1000
   xor     edx,edx
   mov     .y_min,edx
   mov     .x_min,edx
   mov     .y_max,eax
   mov     .x_max,eax
   imul    eax,eax
   mov     .mln,eax             ; 1000 * 1000 - buffers size in dwords
   mov     .only_front,ebx      ; th_no
   ; .vert_to_prj is set at .ff; edi was already changed by rep movsd
   ; above, so there is nothing useful to store from it here
   mov     .shl1000,dword (1000 shl 16 + 1000)
   mov     .flat_line2,flat_line2

   cmp     ebx,-1
   jne     .no_crop_front
   ; front only: all three angles run -17 .. 17 with step 16
   mov     ebx,-17
   mov     .start0,ebx
   mov     .start1,ebx
   mov     .start2,ebx
   neg     ebx
   mov     .end0,ebx
   mov     .end1,ebx
   mov     .end2,ebx
   dec     ebx
   mov     .step,ebx
   ; Alloc 1000x1000  buffs
   mov     ebx,1050*1050*4+65536
   malloc  ebx
   mov     .z_buff,eax
   malloc  ebx
   mov     .prj_buff,eax
   mov     eax,.triangles_count_var
   add     eax,65536
   malloc  eax
   mov     .tri_buff,eax
   mov     eax,.points_count_var
   imul    eax,12
   add     eax,30
   malloc  eax
   mov     .points_rot,eax
   jmp     .ff
 .no_crop_front:
   ; threaded: first angle runs th_no*64 .. th_no*64+65, the other
   ; two 0 .. 257, all with step 32
   mov     ecx,64
   mov     eax,.only_front   ; th no
   imul    ecx,eax

   xor     ebx,ebx
   mov     .start0,ecx
   mov     .start1,ebx
   mov     .start2,ebx
   mov     ebx,257
   add     ecx,64
   inc     ecx
   mov     .end0,ecx
   mov     .end1,ebx
   mov     .end2,ebx
   mov     .step,32

   ; take buffers from record No th_no of thread_params
   shl     eax,4
   add     eax,thread_params
   push    dword[eax]
   pop     .z_buff
   push    dword[eax+4]
   pop     .prj_buff
   push    dword[eax+8]
   pop     .tri_buff
   push    dword[eax+12]
   pop     .points_rot
 .ff:
   mov     edi,.points_r_ptr
   mov     .vert_to_prj,edi
   ; clear triangles mask: (count / 4 + 3000) dwords, allocation has
   ; count + 65536 bytes
   cld
   mov     ecx,[triangles_count_var]
   mov     edi,.tri_buff
   xor     eax,eax
   shr     ecx,2
   add     ecx,3000
   rep     stosd
; Do paralell projection.
; Render faces with index of face as a color.
   push    .start0
   pop     .side0
 .next_side0:
   push    .start1
   pop     .side1
 .next_side1:
   push    .start2
   pop     .side2
 .next_side2:
   mov     eax,.side0
   mov     ebx,.side1
   mov     edx,.side2
   lea     edi,.matrix
   call    make_matrixx
   mov     esi,.vert_to_prj   ; vertices to project
   mov     edi,.points_rot
   mov     ecx,.points_count_var
   lea     ebx,.matrix
   call    rotary
   cld
   mov     ecx,.mln ;1000*1000
   mov     edi,.prj_buff
   mov     eax,-1             ; -1 = no triangle under this pixel
   rep     stosd
   mov     ecx,.mln ;1000*1000
   mov     edi,.z_buff
   mov     eax,60000.1
   rep     stosd
   movups  xmm5,.y_min        ; y_min, y_max, x_min, x_max
   mov     edx,.points_rot
   lea     ebx,[f490x3]
   lea     ecx,[f500x3]
   movups  xmm1,.shl1000      ; shl1000, flat_line2, prj_buff, z_buff
   mov     eax,'innf'      ;  - inner faces remove cause
   call    do_stencil

; check if tri index in buff
   mov     esi,.prj_buff
   mov     ecx,.mln ;1000*1000
   cld
 .chck:
   lodsd
   cmp     eax,.triangles_count_var
   jae     @f                 ; unsigned, rejects -1 as well
   cmp     eax,-1
   je      @f
   add     eax,.tri_buff
   mov     byte[eax],1    ; 1 - tri outside, 0 - inside
  @@:
   loop    .chck

   ; next orientation, innermost angle first
   mov     ecx,.step
   add     .side2,ecx
   mov     ebx,.end2
   cmp     .side2,ebx
   jng     .next_side2
   add     .side1,ecx
   mov      ebx,.end1
   cmp     .side1,ebx
   jng     .next_side1
   add     .side0,ecx
   mov     ebx,.end0
   cmp     .side0,ebx
   jng     .next_side0
   cmp     .only_front,-1   ; only front faces ?
   jne     .end             ; end if no front
 ;  count triangles outside
   mov     esi,.tri_buff
   mov     ecx,.triangles_count_var
   xor     ebx,ebx
 .cnt:
   cmp     byte[esi],1
   jne     @f
   inc     ebx
  @@:
   inc     esi
   loop    .cnt
   mov     .tri_count,ebx
   or      ebx,ebx
   jz      .no_ch
   imul    ebx,12
   add     ebx,1024
   malloc  ebx
   mov     .tri,eax
   mov     ebx,.tri_buff  ; renew triangles list
   mov     esi,.triangles_ptr
   mov     edi,.tri
   mov     ecx,.triangles_count_var
 .move:
   cmp     byte[ebx],1
   jne     @f
   movsd                  ; marked - copy three vertex indices
   movsd
   movsd
   inc     ebx
   loop    .move
   jecxz   .fr            ; always taken here, ecx = 0 after loop
  @@:
   add     esi,12         ; not marked - skip the triangle
   inc     ebx
   loop    .move
 .fr:
   mfree   .triangles_ptr
   push    .tri
   pop     [triangles_ptr]
   push    .tri_count
   pop     [triangles_count_var]
 .no_ch: ; no change in tri list
   ; release all four buffers allocated in front only mode, also when
   ; no triangle was marked
   mfree   .tri_buff
   mfree   .points_rot
   mfree   .prj_buff
   mfree   .z_buff
   cls
 .end:
   mov     eax,.tri_buff
   add     esp,292
   pop     ebp
ret
;===============================================================
rem_cracks:
; remove cracks after only part of object tris tesselation
; 'part' - triangles with positiv norm vect and/or from selected area
;
; For every edge the procedure looks for another edge that shares one
; of its vertices, has the same direction and ends between both
; vertices of the first edge.  A triangle holding both vertices of the
; first edge is then marked (-1,-1,-1) and two new triangles are
; appended in its place.
; in:
;    globals only: nine dwords starting at [points_count_var],
;    [sign_mask], [the_one]
; out:
;    ecx = number of appended triangles
;    when ecx <> 0: [triangles_ptr] replaced, [triangles_count_var]
;    increased by ecx, then remove_non_tri is called
; changes -> general purpose and xmm registers (only ebp is saved)

       push   ebp
       mov    ebp,esp
       sub    esp,292        ; multiple of 4 keeps esp dword aligned
       and    ebp,-16        ; for pushes, calls and macros below
       sub    ebp,80

     .cur_vi           equ dword[ebp-4]
     .ed_no            equ dword[ebp-8]
     .sec_vi           equ dword[ebp-12]
     .fst_vi           equ [ebp-16]       ; read together with .sec_vi
     .eps              equ xword[ebp-32]
     .endt             equ dword[ebp-36]
     .t_ptr            equ dword[ebp-40]
     .curp             equ dword[ebp-44]
     .piv2             equ dword[ebp-48]
     .n_tri_cnt        equ dword[ebp-52]
     .f03              equ dword[ebp-56]
   ;  .points_r         equ dword[ebp-60]
   ;  .edges            equ dword[ebp-64]
     .normVV           equ xword[ebp-80]
     .veca             equ xword[ebp-80-16]
     .vecb             equ xword[ebp-80-32]
     .cur_new_tri_ptr  equ dword[ebp-116]
     .piv              equ dword[ebp-120]
     .len              equ dword[ebp-124]

   ;  .zero_hgst           equ [ebp+80]
   ;  .f05x3               equ [ebp+96]

     ; nine dwords below are filled by one rep movsd - keep the order
     .points_count_var    equ dword[ebp]
     .triangles_count_var equ dword[ebp+4]
     .points_r            equ dword[ebp+8]
     .triangles_ptr       equ dword[ebp+12]
     .points_rotated_ptr  equ dword[ebp+16]
     .points_n_ro_ptr     equ dword[ebp+20]
     .edges               equ dword[ebp+24]
     .edges_count         equ dword[ebp+28]
     .edge_s_d_ptr        equ dword[ebp+32]
     .dir                 equ [ebp+48]
     .sign_mask           equ [ebp+64]

     push    esi edi
     cld
     lea     esi,[points_count_var]
     lea     edi,.points_count_var
     mov     ecx,9
     rep     movsd
     movaps  xmm0,[sign_mask]
     movaps  .sign_mask,xmm0
     pop     edi esi

     prompt  prompt_rm_cracks

     mov     .f03,dword 0.15
     mov     eax,0.0007
     movd    xmm0,eax
     shufps  xmm0,xmm0,0
     movaps  .eps,xmm0              ; tolerance, broadcasted

     ; .t_ptr - edges_count * 4  + 1024 bytes, table of pointers
     ; .piv, .piv2 - edges_count * 16 + 4096 bytes each
     mov     ebx,.edges_count
     shl     ebx,2
     add     ebx,1024
     malloc  ebx
     mov    .t_ptr,eax
     shl     ebx,2
     push    ebx
     malloc  ebx
     mov     .piv,eax
     pop     ebx
     malloc  ebx
     mov    .piv2,eax
                                    ; do pivot list
     ; two units per edge: (1st vert ind, edge No), (2nd vert ind, edge No)
     xor     ecx,ecx
     mov     edi,eax
     mov     esi,.edges
   @@:
     mov     eax,ecx
     movsd
     stosd
     movsd
     stosd
     inc     ecx
     cmp     ecx,.edges_count
     jnz     @b
     mov     esi,.piv2
     mov     edi,.piv
     mov     ecx,.edges_count
     add     ecx,ecx
     call    sort_hybrid
;    sort_hybrid  calling conv
; in:  esi - 1st table
;      edi - sec table
;      ecx - tables_units count - unit 4 bytes
; out:
;      sorted first table
     mfree   .piv
     ; from now .piv is the new triangles list: a copy of the old
     ; one with room for edges_count more triangles
     mov     eax,.triangles_count_var
;    mov     ebx,[edges_d_count]
;    sub     ebx,.edges_count  ; ebx - single edes numb
;    add     ecx,ebx
     add     eax,.edges_count
     imul    eax,12
     add     eax,1024
     malloc  eax
     ; new triangles
     mov    .piv,eax
     mov     edi,eax
     mov     esi,.triangles_ptr
     mov     ecx,.triangles_count_var
     lea     ecx,[ecx*3]
     cld
     rep     movsd
     mov     .cur_new_tri_ptr,edi
     xor     eax,eax
     mov     ebx,.piv
     mov     .n_tri_cnt,eax
     ;place for new tris
     ; fill .t_ptr: one pointer into .piv2 for the first unit and one
     ; for every unit where the vertex index changes
     mov     ecx,.edges_count
     add     ecx,ecx
     mov     esi,.piv2
     mov     eax,ecx
     shl     eax,2
     add     eax,esi
     mov     ebx,eax
     add     ebx,8
     mov     edi,.t_ptr       ; table of pointers/adresses
     cld
     mov     [edi],esi
     add     edi,4
     dec     ecx
     jz      .dn44
    .pt:
     lodsd                    ; [esi]   - vert ind
     add     esi,4
     cmp     eax,[esi]        ; [esi+4] - tri ind
     je      @f
     mov     eax,esi
     stosd                    ; ptr to piv table
     dec     ecx
     jz      .dn44
    @@:
     cmp     esi,ebx  ;.endt
     jb      .pt

  .dn44:   ; done
     cld
     mov      ecx,.edges_count
     mov      .endt,ecx
     mov      esi,.edges
     xor      ecx,ecx
  .nxx_edge:                      ; search for cracks/slits
     ; stack inside this loop: [esp] = esi (next edge), [esp+4] = ecx
     push     ecx
     mov      .ed_no,ecx
     lodsd                        ; ed first
     mov      .fst_vi,eax         ; 1 vertex
     xchg     ebx,eax
     lodsd
     push     esi                 ; 2vert
     mov      .sec_vi,eax         ; ed sec
 ;    mov      esi,ecx
 ;    and      ecx,111b
 ;    shr      esi,3
 ;    add      esi,.edge_s_d_ptr
 ;    bt       word[esi],cx   ; is single?
 ;   jc       .nx_ed         ; is double!
     imul     eax,12
     imul     ebx,12
     add      eax,.points_r
     add      ebx,.points_r
     movups   xmm7,[eax]
     movups   xmm6,[ebx]
     movaps   .veca,xmm7     ; .veca - 2nd vertex of the edge
     movaps   .vecb,xmm6     ; .vecb - 1st vertex of the edge
     subps    xmm6,xmm7      ; I tried more general calculations
                             ; direction dependend, I abandon coz numerical
     movaps   xmm7,xmm6      ; errors - values near zero fight
     dpps     xmm6,xmm6,01110111b
     sqrtps   xmm6,xmm6
     movss    .len,xmm6      ; length of the edge
     rcpps    xmm6,xmm6
     mulps    xmm7,xmm6    ;xm7 - direction vect of cur edge
     movaps   .dir,xmm7
     mov      eax,.fst_vi  ;xmm0    ; 1st vert ind
     mov      .cur_vi,eax
   .sec:      ; sec v index
     shl      eax,2
     add      eax,.t_ptr
     mov      esi,eax      ; esi - addr in piv tab
     lodsd                 ; vert index  ;search for other ed
     or       eax,eax
     jz       .nx_ed
     mov      .curp,eax
   .nx:
     ; next (vert ind, edge No) unit from the pivot list
     mov      esi,.curp
     lodsd                 ; piv
     xchg     ebx,eax
     lodsd
     mov      .curp,esi    ; cur ptr
     cmp      ebx,.cur_vi  ; still the same vert index?
     jne      .nx_ed_in
     cmp      eax,.ed_no   ;  the same ed?
     je       .nx
     mov      esi,eax
     shl      esi,3
     add      esi,.edges
     movlps   xmm7,[esi]   ; both vert indices of the candidate edge
     lodsd
     xchg     ebx,eax
     lodsd
     jmp      .check
   .nx_ed_in:
     ; units of .cur_vi are exhausted: retry once with the second
     ; vertex of the edge, then go to the next edge
     mov      ebx,.cur_vi
     mov      eax,.sec_vi
     mov      .cur_vi,eax
     cmp      ebx,eax
     jne      .sec
     ; sec index of cur ed
   .nx_ed:
     pop      esi
     pop      ecx
     inc      ecx
     cmp      ecx,.endt
     jnz     .nxx_edge
     jmp     .end
   .check:

  ; .check_deeper:
     ;    .fst_vi,  .sec_vi - cur ed v indexes - dir .dir
     ;     verts: xm6, xm5 [ebx], [eax]
     ;     xmm7 - 2 ed v indexes - dir the same
     ; make the lowest dword of xmm7 the vertex other than .cur_vi
     movd     edx,xmm7
     cmp      edx,.cur_vi
     jne      @f
     shufps   xmm7,xmm7,00010001b
  @@:
     movd     edx,xmm7
     sub      esp,8
     movlps   [esp],xmm7
     pop      eax esi
     imul     esi,12
     imul     eax,12
     add      eax,.points_r
     add      esi,.points_r
     movups   xmm6,[eax]
     movups   xmm5,[esi]
     subps    xmm6,xmm5

     ; accept only when dot (candidate dir, .dir) is 1.0 within .eps
     movaps   .normVV,xmm6
     lea      edi,.normVV
     call     normalize_vector
     lea      esi,.dir
     call     dot_product
     movaps   xmm5,xmm0
     subps    xmm5,.eps  ;xmm4     ; check dot
     addps    xmm0,.eps  ;xmm4
     movaps   xmm4,[the_one]
     cmpltps  xmm5,xmm4
     cmpltps  xmm0,xmm4
     xorps    xmm0,xmm5
     movmskps esi,xmm0
     and      esi,1
     cmp      esi,1
     jne      .nx

if 0
  ;  lowest dd xmm7 = edx ->  significant index
  ;  calc margin
     imul     edx,12
     add      edx,.points_r
     movups   xmm6,[edx]

     movaps   xmm1,[the_one]
     subps    xmm1,.eps

     movaps   xmm5,.veca
     subps    xmm5,.vecb
     subps    xmm6,.vecb
     movaps   xmm2,xmm6
     ; 0.0 <= dot (p1-p0,p-p0)/|p-p0| = 1.0
     dpps     xmm5,xmm6,01110111b
     mulps    xmm5,xmm5
     dpps     xmm2,xmm2,01110111b
;     movd     eax,xmm2
;     or       eax,eax
;     jz       @f
     rcpps    xmm2,xmm2
     mulps    xmm5,xmm2
;   @@:
     movaps   xmm6,xmm5
     cmpltps  xmm6,xmm1
     cmpltps  xmm5,.eps
     xorps    xmm6,xmm5
     movmskps eax,xmm6
     and      eax,1
     cmp      eax,1
     jne      .nx
end if

  ;  lowest dd xmm7 -  significant index
  ;  calc margin
     imul     edx,12
     add      edx,.points_r
     movups   xmm3,[edx]
     xor      edx,edx
   @@:
     movaps   xmm1,.veca  ; xmm6  ;[ebx]
     addps    xmm1,xmm3
     dpps     xmm1,xmm1,01110111b
     sqrtps   xmm1,xmm1   ; 1st length
     bts      edx,1       ; at most two passes: CF set on the second
     jc       @f
     xorps    xmm3,.sign_mask
     ; must be shorter than .lenght   ; think about situation
     comiss   xmm1,.len               ; two leghts shorter than original 'len'
     ja       @b                      ; but its sum - dosnt match....
   @@:                                ; - vert from edx must be inbetween .a and .b
     xor      edx,edx
   @@:
     movaps   xmm2,.vecb ;xmm5  ;[eax]
     addps    xmm2,xmm3
     dpps     xmm2,xmm2,01110111b
     sqrtps   xmm2,xmm2   ; 2nd length
     bts      edx,1
     jc       @f
     xorps    xmm3,.sign_mask
     ; must be shorter than .lenght
     comiss   xmm2,.len
     ja       @b
   @@:
     addps    xmm2,xmm1
     movaps   xmm1,xmm2
     subps    xmm1,.eps
     addps    xmm2,.eps
     cmpltss  xmm1,.len
     cmpltss  xmm2,.len
     xorps    xmm1,xmm2
     movmskps edx,xmm1    ; is sum of lenghts equal to len ??
     and      edx,1b
     cmp      edx,1b
     jne      .nx         ; next index from pivot lst edge


     ; look in the old part of the new list for a triangle that holds
     ; both vertices of the current edge
     movlps   xmm1,.fst_vi                ; fix tri list
     movaps   xmm2,xmm1
     shufps   xmm1,xmm1,0          ; xmm1 - .fst_vi broadcasted
     shufps   xmm2,xmm2,01010101b  ; xmm2 - .sec_vi broadcasted

     mov      esi,.piv
     xor      ecx,ecx
     mov      ebx,.triangles_count_var
  .lllv:
     movups   xmm3,[esi]
     movaps   xmm4,xmm1
     movaps   xmm0,xmm2
     pcmpeqd  xmm4,xmm3
     pcmpeqd  xmm0,xmm3
     orps     xmm0,xmm4
     movmskps eax,xmm0
     and      eax,111b
  ;   cmp      eax,011b
  ;   jz       .ok1
  ;   cmp      eax,101b
  ;   jz       .ok1
  ;   cmp      eax,110b
  ;   jz       .ok1

     popcnt   edx,eax     ; exactly two of three indices must match
     cmp      edx,2
     je       .ok1

   @@:
     add      esi,12
     inc      ecx
     cmp      ecx,ebx
     jnz      .lllv
     jmp      .nx
     ; ecx - tri to divide  index
    .ok1:
     ; rotate indices until the one that did not match is the lowest
     bt       eax,0
     jnc      @f
     shufps   xmm3,xmm3,11001001b
     bt       eax,1
     jnc      @f
     shufps   xmm3,xmm3,11001001b
    @@:

     movaps   xmm4,xmm3
     mov      edi,esi
     lodsd
     xchg     ebx,eax
     lodsd
     xchg     ecx,eax
     lodsd
     mov      edx,12
     imul     ebx,edx
     add      ebx,.points_r
     imul     ecx,edx
     add      ecx,.points_r
     imul     eax,edx
     add      eax,.points_r
     movups   xmm1,[eax]
     movups   xmm2,[ebx]
     movups   xmm0,[ecx]
     or       eax,-1
     stosd    ; mark divided triangle with -1, -1, -1
     stosd
     stosd

     subps    xmm0,xmm1
     subps    xmm1,xmm2
     call     cross_reg

 ;    dpps    xmm1,xmm0,01110111b
 ;    sqrtps  xmm1,xmm1
 ;    divps   xmm0,xmm1   ;   maybye just chck sign of 'Z' coorf ??
     movaps   .normVV,xmm0  ; normal of the divided triangle

     ;  3 2 1 xm4 & 1 xm7
     ; tris inds
     ; both tris have  llxm4
     ; lh & hl xm4 indexs - xm6, xm5 verts
     ; llxm4: 3rd index of tri  @ .tri_ptr
     ; chck its senses (whole vec) of norm vects, must like in .normVV

                             ; sec tri v indexes -
                             ; xmm7 lowst and hgst
                             ; and dword on stack
     movaps    xmm1,xmm4
     movlhps   xmm1,xmm7
     shufps    xmm1,xmm1,11111000b
     movhps    xmm1,.fst_vi   ; xmm1 - 1b         1a     2a     3a
                              ;         scxm0    lwxm0  lwxm7  lwst xm6
     movaps    xmm7,xmm1
     shufps    xmm1,xmm1,11100001b
     shufps    xmm7,xmm7,10110100b
     movaps    xmm3,xmm1
     movhlps   xmm2,xmm1
     shufps    xmm1,xmm1,11100001b
     ; xm1 - xm3 indices
     mov       ecx,2          ; two new triangles
     cld
     mov       edi,.cur_new_tri_ptr
     movaps    xmm6,.normVV
     ; movaps     xmm5,.eps
   .nxx_new_tri:
     movd      eax,xmm1
     mov       ebx,eax
     stosd
     movd      eax,xmm2
     mov       edx,eax
     stosd
     movd      eax,xmm3
     stosd                   ; renew   tris
     imul      ebx,12
     imul      edx,12
     imul      eax,12
     add       ebx,.points_r
     add       edx,.points_r
     add       eax,.points_r
     movups    xmm0,[eax]
     movups    xmm1,[ebx]
     movups    xmm2,[edx]
     subps     xmm0,xmm1
     subps     xmm1,xmm2
     call      cross_reg
  ;   destorys xm0 - xm3
  ;   dpps      xmm2,xmm0,01110111b
  ;   sqrtps    xmm2,xmm2
  ;   divps     xmm0,xmm2
     xorps     xmm0,xmm6
   ;  movaps    xmm1,xmm0  ; check only signs !!
   ;  addps     xmm1,xmm5  ;.eps
   ;  subps     xmm0,xmm5  ;.eps
   ;  cmpltps   xmm1,xmm6  ;.normVV
   ;  cmpltps   xmm0,xmm6  ;.normVV
   ;  xorps     xmm1,xmm0  ;.normVV
     movmskps  eax,xmm0
     and       eax,111b
     cmp       eax,111b     ; all three signs differ from .normVV ?
     jne       @f
     mov       edx,[edi-4]
     mov       eax,[edi-8]
     mov       [edi-4],eax  ; swap
     mov       [edi-8],edx
   @@:
     movaps    xmm2,xmm7    ; indices of the second new triangle
     movaps    xmm1,xmm7
     movhlps   xmm3,xmm7
     shufps    xmm2,xmm2,11100001b
     loop      .nxx_new_tri
     inc       .n_tri_cnt
     inc       .n_tri_cnt
     mov       .cur_new_tri_ptr,edi
     jmp       .nx_ed  ;_chck
   .end:
     mov       ebx,.n_tri_cnt
     or        ebx,ebx
     jz        .end22
     ; something was appended: the new list replaces the old one
     mfree     .triangles_ptr
     push      .piv
     pop       [triangles_ptr]
     mov       ecx,.n_tri_cnt
     add       [triangles_count_var],ecx
     call      remove_non_tri
     jmp       .end3
  .end22:
     mfree     .piv
  .end3:
     mfree     .piv2
     mfree     .t_ptr
     cls
     mov       ecx,.n_tri_cnt
     add       esp,292       ; must match 'sub esp' in the prologue
     pop       ebp
ret
;===================================================================
call_thread:
; Create .maxTh suspended threads running the same procedure, resume
; them all, wait until every one has finished and close the handles.
; in eax  - proc-th adress
;    edx -  param bitewise 'or'-ed with thread number and passed as
;           the thread parameter to proc_th
;    ecx -  max thread count, if -1 then 4 threads are used
;
;    if ecx = 'max' use maximum thread in [CoresCount]
; changes -> eax, ebx, ecx, edx  (ebx is not restored)

     push   ebp
     mov    ebp,esp
     push   edx
     push   eax
     push   ecx

     .par1       equ dword[ebp-4]
     .procTh     equ dword[ebp-8]
     .maxTh      equ dword[ebp-12]
     .ptr        equ dword[ebp-16]
     .hthread_base  equ dword[ebp-20]
     .ThreadID_base equ dword[ebp-24]

     ; one 3000 byte block: handles table at its start,
     ; thread IDs table 1600 bytes further
     mov     eax,3000
     malloc  eax
     push    eax  ; ptr

     push    eax  ; base1
     add     eax,1600
     push    eax  ; base2

     mov     ecx,.maxTh
     cmp     ecx,'max'
     jne     @f
     push    [CoresCount]
     pop     .maxTh
     jmp     .fff
   @@:
     mov    ebx,4
     cmp    ecx,-1
     cmove  ecx,ebx
     mov    .maxTh,ecx
  .fff:
;     lea    eax,.hthread1
;     lea    edx,.ThreadID1
;     cmp    ecx,1
;     jne    .ff
;     mov    .hthread_base,eax
;     sub    eax,4
;     mov    .ThreadID_base,edx
;     mov    .maxTh,ecx
;     jmp    .fff
;   .ff:
;     mov    .hthread_base,hthread0
;     mov    .ThreadID_base,ThreadID0
     xor     ebx,ebx
   .th_l:
     push    ebx
     mov     eax,ebx
     or      eax,.par1         ; thread parameter = thread No or par1
     mov     edx,.ThreadID_base
     invoke  CreateThread,NULL,NULL,.procTh,\
             eax,CREATE_SUSPENDED,0 ;,\
          ;   [edx+ebx*4]
  ;   invoke  CreateThread,0,0,r15,rbx,CREATE_SUSPENDED,0
     shl     ebx,2
     add     ebx,.hthread_base
     mov     [ebx],eax         ; store handle No ebx
;     invoke  SetThreadPriority,ebx,NORMAL_PRIORITY_CLASS
     pop     ebx
     inc     ebx
     cmp     ebx,.maxTh
     jne     .th_l

     xor     ebx,ebx
   @@:
     mov     eax,.hthread_base  ; eax is reloaded after every invoke
     invoke  SetThreadAffinityMask,[eax+ebx*4],0 ;1
     inc     ebx
     cmp     ebx,.maxTh
     jne     @b

     xor     ebx,ebx
   @@:
     mov     eax,.hthread_base
     invoke  ResumeThread,[eax+ebx*4]
     inc     ebx
     cmp     ebx,.maxTh
     jne     @b
     ; wait for all (3rd arg = 1) threads, no time limit (-1)
     mov     eax,.hthread_base
     invoke  WaitForMultipleObjectsEx,.maxTh,eax,1,-1,0

     xor     ebx,ebx
   @@:
     mov     eax,.hthread_base
     invoke  CloseHandle,[eax+ebx*4]
     inc     ebx
     cmp     ebx,.maxTh
     jne     @b
     ; .hthread_base equals .ptr and .ThreadID_base = .ptr + 1600 lies
     ; inside the same block, so the block is freed once, by its base
     mfree   .ptr
     mov     esp,ebp
     pop     ebp
ret
;=============================================================================
del_tiv_ie_without:
;   Delete triangles that have at least one inside vertex
;   and no intersecting edge.
; in:  esi - ptr to inside vertices mask list (one bit per vertex)
;      ebx - ptr to intersecting edges mask list (one bit per edge)
; out: corrected triangles list (remove_non_tri is called at the end).
; Reads [triangles_ptr], [triangles_count_var], [edges_ptr],
; [edges_count] and [points_count_var].
; Overwrites eax, ebx, ecx, edx, esi, edi and xmm0-xmm6 in this body.
   push     ebp
   mov      ebp,esp
   .in_vr     equ dword[ebp-4]  ; ptr to inside vertices list,
                                ; if the corresponding bit is zero ->
                                ; -> vertex is inside object

   .clidd_ed  equ dword[ebp-8]  ; ptr to collided edges list,
                                ; each bit in list -> edge,
                                ; if bit is set -> edge intersects
   push     esi                   ; this slot is .in_vr
   push     ebx                   ; this slot is .clidd_ed
   xor      ecx,ecx               ; ecx = current vertex index
 .main_loop:
   push     ecx                   ; vertex index is restored at .nx_ed
   mov      esi,ecx
   mov      ebx,ecx
   shr      esi,3                 ; byte offset in the vertex mask
   and      ebx,111b              ; bit number inside that byte
   add      esi,.in_vr
   bt       [esi],ebx
   jc       .nx_ed                ; bit set -> vertex is not inside, skip it

   movd     xmm0,ecx      ; vertex is inside, look for triangles that use it
   shufps   xmm0,xmm0,0   ; broadcast the vertex index to all four lanes
   mov      esi,[triangles_ptr]
   xor      edx,edx       ; edx = current triangle index
  .nx_tri_loop:
   movups   xmm1,[esi]
   pcmpeqd  xmm1,xmm0   ; xmm0 - broadcasted vert index
   movmskps eax,xmm1
   and      eax,111b    ; only the three indices of this triangle count
   or       eax,eax
   jnz      @f          ; triangle references the inside vertex
  .ntri:
   inc      edx
   add      esi,12      ; 3 dword indices per triangle
   cmp      edx,[triangles_count_var]
   jnz      .nx_tri_loop
   jmp      .nx_ed
  @@:
   ; Triangle found. Build the three index pairs of the triangle
   ; (indices a,b,c), each stored in both orders:
   ;   xmm1 = b,a,a,b   xmm2 = c,a,a,c   xmm3 = b,c,c,b
   movups   xmm1,[esi]
   movaps   xmm2,xmm1   ;[esi]
   movaps   xmm3,xmm1
   shufps   xmm1,xmm1,01000001b
   shufps   xmm2,xmm2,10000010b
   shufps   xmm3,xmm3,01101001b
   xor      ecx,ecx     ; ecx = current edge index
   mov      edi,[edges_ptr]
 .check:
   movlps   xmm4,[edi]            ; one edge = two dword vertex indices
   shufps   xmm4,xmm4,01000100b   ; xmm4 = e0,e1,e0,e1
   movaps   xmm5,xmm4
   movaps   xmm6,xmm4
   pcmpeqd  xmm4,xmm1
   pcmpeqd  xmm5,xmm2
   pcmpeqd  xmm6,xmm3

   movmskps eax,xmm4
;  (An older, commented-out variant tested each two-lane half of the
;   three masks against 11b separately; it was replaced by the popcnt
;   tests below.)
;  NOTE(review): popcnt == 2 is also true when the two equal lanes lie
;  in different halves; this presumably cannot happen for edges with
;  two distinct indices - confirm.

   popcnt   eax,eax
   cmp      eax,2
   jz       .ed_found
   movmskps eax,xmm5
   popcnt   eax,eax
   cmp      eax,2
   jz       .ed_found
   movmskps eax,xmm6
   popcnt   eax,eax
   cmp      eax,2
   jz       .ed_found
  .no_int_ed:
   add      edi,8
   inc      ecx
   cmp      ecx,[edges_count]
   jnz      .check

  ; All edges were checked and none of the triangle's edges intersects,
  ; so the triangle is marked: all three indices are overwritten with
  ; the inside vertex index.
   movlps   [esi],xmm0
   movss    [esi+8],xmm0
 ; Marked triangles are dropped from the list by remove_non_tri
 ; at the end of this procedure.
   jmp      .ntri  ; look for another triangle with this inside
                   ; vertex and without an intersecting edge
  .ed_found:
   ; ecx = index of an edge that belongs to the triangle
   mov      ebx,ecx
   mov      eax,ecx
   and      ebx,111b
   shr      eax,3
   add      eax,.clidd_ed
   bt       [eax],ebx
   jnc      .no_int_ed   ; this edge does not intersect, keep searching
   ; NOTE(review): an intersecting edge falls through to the next VERTEX,
   ; so remaining triangles of the current vertex are not examined -
   ; confirm this is intended (the other path goes to .ntri).
  .nx_ed:
   pop      ecx
   inc      ecx
   cmp      ecx,[points_count_var]
   jnz      .main_loop
   call     remove_non_tri
   mov      esp,ebp      ; also discards the two pushed pointer slots
   pop      ebp
ret
;================================================================================
clip_triangles:
; Clip away the triangles that are not inside the tesselate area.
; The test is done on the rotated vertex list.
; in:  globals [points_rotated_ptr], [triangles_ptr],
;      [triangles_count_var], [tri_area_x1] (16 bytes are loaded,
;      presumably x1,y1,x2,y2 - TODO confirm layout)
; out: if at least one triangle was rejected, [triangles_ptr] points
;      to a newly allocated list holding only the kept triangles and
;      [triangles_count_var] is their count; otherwise nothing changes.
; A triangle is kept only when the x and y of all three vertices give
; different results for the "less than" test against the low and the
; high limit, i.e. lie between them.
; Overwrites eax, ebx, ecx, edx, esi, edi, xmm0-xmm5, xmm7.
   push     ebp
   mov      ebp,esp
   sub      esp,16
   .tri_ptr equ dword[ebp-4]     ; new triangle list
   .cntt    equ dword[ebp-8]     ; count of kept triangles
   .ptr     equ dword[ebp-12]    ; copy of [points_rotated_ptr]
   mov      eax,[points_rotated_ptr]
   mov      .ptr,eax
   mov      eax,[triangles_count_var]
   imul     eax,12               ; 12 bytes per triangle
   add      eax,20
   malloc   eax
   mov      .tri_ptr,eax

   mov      edi,eax              ; edi = write position in the new list
   xor      ebx,ebx
   mov      esi,[triangles_ptr]  ; esi = read position in the old list
   mov      ecx,[triangles_count_var]
   mov      .cntt,ebx
   cld
   movaps   xmm7,[tri_area_x1]
   ; Empty list: without this guard 'loop' would wrap ecx and walk
   ; through memory for 2^32 iterations.
   test     ecx,ecx
   jz       .enl
  .l:
   mov      eax,[esi]
   mov      ebx,[esi+4]
   mov      edx,[esi+8]
   imul     eax,12               ; vertex index -> byte offset
   imul     ebx,12
   imul     edx,12
   add      eax,.ptr
   add      ebx,.ptr
   add      edx,.ptr
   movlps   xmm0,[eax]           ; x,y of each vertex
   movlps   xmm1,[ebx]
   movlps   xmm2,[edx]
   movlhps  xmm0,xmm0            ; x,y,x,y
   movlhps  xmm1,xmm1
   movlhps  xmm2,xmm2
   cmpltps  xmm0,xmm7
   cmpltps  xmm1,xmm7
   cmpltps  xmm2,xmm7
   movhlps  xmm3,xmm0            ; results of the high-limit compare
   movhlps  xmm4,xmm1
   movhlps  xmm5,xmm2
   xorps    xmm0,xmm3            ; lanes 0,1: set when between limits
   xorps    xmm1,xmm4
   xorps    xmm2,xmm5
   andps    xmm0,xmm1            ; all three vertices must pass
   andps    xmm0,xmm2
   movmskps eax,xmm0
   and      eax,11b
   cmp      eax,11b
   jne      @f                   ; outside -> do not copy
   movsd                         ; copy the three indices,
   movsd                         ; advances esi and edi by 12
   movsd
   inc      .cntt
   loop     .l
   jecxz    .enl                 ; always taken here: ecx = 0
  @@:
   add      esi,12               ; rejected triangle - skip it
   loop     .l
  .enl:
   mov      ecx,.cntt
   cmp      ecx,[triangles_count_var]
   je       @f                   ; nothing was clipped
   mov      [triangles_count_var],ecx
   mfree    [triangles_ptr]      ; replace the old list with the new one
   push     .tri_ptr
   pop      [triangles_ptr]
   jmp      .end
  @@:
   mfree    .tri_ptr             ; the new list is not needed
 .end:
   mov      esp,ebp
   pop      ebp
ret
;===========================================================
mark_inner_vert:
;  Find vertices inside object,
;  or find tris outside object.
;  The work is done by mark_inner_vert_th, started through call_thread
;  with [CoresCount] and 0xf threads; the per-thread bit lists are then
;  OR-ed into the list of thread 0.
;  NOTE(review): because of the 'and 0xf' a core count of 16 gives 0
;  threads - confirm the intended maximum.
;  in:  initialised triangles and vertices list
;        if ax = 'ic' -> mark vertices inside concrete chunk,
;          then ecx = chunk number
;
;  out:
;            ecx  = inside vertices number
;            ebx  = pointer to list, each vertex as bit,
;                    if bit is zeroed, vertex is inside
;  Fixes compared with the previous version:
;   - the counting loop used 'shr edx,8' as byte index although the
;     list holds 8 vertices per byte (as written by the thread proc),
;   - the free loop added the core count as a byte offset to
;     thread_params, so wrong/misaligned slots were freed,
;   - with one (or zero) threads the merge and free loops, which test
;     their counter only at the bottom, ran away; they are skipped now,
;   - zero vertices no longer run the counting loop.
     push    ebp
     mov     ebp,esp
     .coresc              equ dword [ebp-4]
     .count_of_particles  equ dword [ebp-8]
   ;  .chu_no_and         equ dword [ebp-12]
     prompt  prompt_mark_inn_vr
     mov     ebx,[CoresCount]
     and     ebx,0xf
     push    ebx ; .coresc
     mov     ebx,[points_count_var]
     or      edx,-1
     shl     ecx,8              ; chunk number goes to bits 8..31
     shl     edx,8              ; default: bits 8..31 all set = 'no chunk'
     push    ebx ; .count_of_particles
   ;  push    ecx
     cmp     ax,'ic'
     cmove   edx,ecx
     mov     eax,mark_inner_vert_th
     mov     ecx,.coresc
; in eax  -  proc-th address
;    edx -   param combined with the param passed to proc_th
     call    call_thread

     ; Merge the lists of threads 1..coresc-1 into the list of thread 0.
     cmp     .coresc,1
     jbe     .merge_done        ; nothing to merge for a single thread
     xor     ecx,ecx
     mov     ebx,.coresc
     dec     ebx                ; number of lists to merge
   .l1:
     mov     edi,[thread_params]
     push    ecx
     mov     esi,[thread_params+4+4*ecx]
     mov     ecx,.count_of_particles
     shr     ecx,7              ; 128 vertices per 16-byte step
     inc     ecx
   @@:
     movaps  xmm0,[esi]
     orps    xmm0,[edi]
     movaps  [edi],xmm0
     add     edi,16
     add     esi,16
     loop    @b
     pop     ecx
     inc     ecx
     cmp     ecx,ebx
     jnz     .l1
   .merge_done:

     ; Count inside vertices: start with all vertices and subtract one
     ; for every set bit (set bit = vertex is not inside).
     xor     ecx,ecx
     mov     ebx,.count_of_particles
     test    ebx,ebx
     jz      .cnt_done
    .cnt:
     push    ecx
     mov     edx,ecx
     shr     edx,3              ; byte that holds the bit of vertex ecx
     and     ecx,111b           ; bit number inside that byte
     add     edx,[thread_params]
     bt      [edx],ecx
     sbb     ebx,0
     pop     ecx
     inc     ecx
     cmp     ecx,.count_of_particles  ;[points_count_var]
     jnz     .cnt
    .cnt_done:

     ; Free the lists of threads coresc-1 .. 1; list 0 is returned.
     ; ebx (the result) is expected to survive mfree, as before.
     cmp     .coresc,1
     jbe     .free_done
     mov     ecx,.coresc
     dec     ecx
    .free:
     push    ecx
     mov     edi,[thread_params+ecx*4]
     mfree   edi
     pop     ecx
     loop    .free
    .free_done:
     cls
     mov     ecx,ebx  ;.in_vert_count
     mov     ebx,[thread_params]; ptr to list
     mov     esp,ebp
     pop     ebp
ret
;=======================================================================
fix_normals:
; Fix the winding of triangles with the help of two z-buffer renderings
; of the object, seen from a pseudo random direction (angles are taken
; from rdtsc):
;   pass 0 (.do_fix = 0) - render all triangles, mark every triangle
;                          whose centre is the front-most one in the
;                          z buffer,
;   pass 1 (.do_fix = 1) - render again but skip triangles by the sign
;                          of their rotated normal z; a triangle that
;                          was marked in pass 0 and is not in front now
;                          gets its first two vertex indices swapped.
; in:  globals [triangles_ptr], [triangles_count_var],
;      [triangles_normals_ptr], [points_r_ptr], [points_count_var]
; out: triangle list with swapped indices where needed.
; Overwrites all general purpose and xmm registers used below.
; The frame size is 152 (was 150) so that esp stays dword aligned for
; every push and call made inside the procedure.
   push  ebp
   mov   ebp,esp
   sub   esp,152
   sub   ebp,32
   .z_buff              equ dword[ebp-4]
   .triangles_count_var equ dword[ebp-8]
   .tri_mark            equ dword[ebp-12]  ; one bit per triangle
   .crp3                equ dword[ebp-16]  ; float 1/3
   .do_fix              equ dword[ebp-20]  ; pass number
   .norm_rot            equ dword[ebp-24]  ; rotated triangle normals
   .irp3                equ dword[ebp-28]  ; 2^32/3 for integer divide by 3
   .t_ptr               equ dword[ebp-32]
   .siny                equ dword[ebp-36]
   .cosy                equ dword[ebp-40]
   .sinz                equ dword[ebp-44]
   .cosz                equ dword[ebp-48]
   .points_rot          equ dword[ebp-52]  ; projected vertices
   .side1               equ dword[ebp-56]
   .side2               equ dword[ebp-60]
   .matrixx             equ      [ebp-104]
   .fx_occ              equ dword[ebp-108] ; at least one fix occurred
   .shll                equ      [ebp-112]
   .y_min               equ      [ebp]     ; y_min, y_max, x_min, x_max

   mov      .crp3,0.333333    ; [constrecip3]
   mov      .shll,dword stencil_line
   mov      .irp3, 0x55555555  ; [irecipr3]
   push     [triangles_ptr]
   pop      .t_ptr
   push     [triangles_count_var]
   pop      .triangles_count_var
   ; Alloc  1000x1000  buffs
   mov      eax,1050*1050*4+65536
   malloc   eax
   mov      .z_buff,eax
   mov      eax,[points_count_var]
   imul     eax,12
   add      eax,30
   malloc   eax
   mov      .points_rot,eax
   mov      eax,.triangles_count_var
   imul     eax,12
   add      eax,30
   malloc   eax
   mov      .norm_rot,eax
   mov      ebx,.triangles_count_var
   add      ebx,150
   shr      ebx,3
   malloc   ebx
   mov      .tri_mark,eax
   ; NOTE(review): .tri_mark is not cleared here (the clearing code is
   ; commented out below) - presumably malloc returns zeroed memory.
   xorps    xmm1,xmm1
   mov      edx,1000
   movd     xmm1,edx
   shufps   xmm1,xmm1,00110011b   ; 0,1000,0,1000
   movups   .y_min,xmm1
   ; mov     edi,eax  ;.tri_mark
   ; xor     eax,eax
   ; mov     ecx,.triangles_count_var
   ; shr     ecx,2
   ; inc     ecx
   ; cld
   ; rep     stosd
   ; Do parallel projection.
   ; Render faces - only z coord
   rdtsc                     ; three bytes of the time stamp = angles
   movzx    ebx,al
   shr      eax,8
   movzx    edx,al
   shr      eax,16
   movzx    eax,al
   lea      edi,.matrixx
   call     make_matrixx
   ; NOTE(review): this fill stores whatever eax holds after
   ; make_matrixx; the buffer is passed to rotary as destination
   ; right below.
   mov      esi,[triangles_normals_ptr]
   mov      edi,.norm_rot
   mov      ecx,.triangles_count_var
   lea      ecx,[ecx*3]
   inc      ecx
   rep      stosd
   mov      esi,[triangles_normals_ptr]
   mov      edi,.norm_rot
   mov      ecx,.triangles_count_var
   lea      ebx,.matrixx
   call     rotary
   lea      esi,.matrixx    ;esi - pointer to 3x3 matrix
   mov      ebx,f490x3      ;ebx - ptr to scale
   call     add_scale_to_matrix
   mov      esi,[points_r_ptr]  ; vertices to project
   mov      edi,.points_rot
   mov      ecx,[points_count_var]
   lea      ebx,.matrixx
   call     rotary
   mov      esi,.points_rot
   mov      ecx,[points_count_var]
   movaps   xmm7,[f500x3]
 @@:
   ; Move to the buffer centre; x,y become integers and are stored
   ; swapped (y first), z stays a float.
   movups   xmm0,[esi]
   addps    xmm0,xmm7  ;[f500x3]
   movhlps  xmm1,xmm0
   cvtps2dq xmm0,xmm0
   shufps   xmm0,xmm0,11100001b
   movlps   [esi],xmm0
   movss    [esi+8],xmm1      ; z coord still float
   add      esi,12
   loop     @b

   xor      eax,eax
   mov      .fx_occ,eax          ; at least one fix took place
   mov      .do_fix,eax
 .sec:
   mov      ecx,1020*1020
   mov      edi,.z_buff
   mov      eax,6000.1           ; 'nothing rendered' depth
   cld
   rep      stosd
   mov      esi,[triangles_ptr]
   xor      ecx,ecx
   mov      edx,12
 .rd:  ; render  with writing only to z buff
   xor      ebx,ebx
   push     esi
   push     ecx
   cmp      .do_fix,ebx
   je       @f                   ; pass 0 renders every triangle
   imul     ecx,12         ; cull face
   add      ecx,.norm_rot
   bt       dword[ecx+8],31      ; sign bit of the rotated normal z
   jnc      .skip  ; no render
 @@:
   ; cld
   ; lodsd
   ; xchg   eax,edx
   ; lodsd
   ; xchg   eax,ebx
   ; lodsd
   ; xchg   eax,edx
   mov      eax,[esi]
   mov      ebx,[esi+4]
   mov      ecx,[esi+8]
   imul     eax,12
   imul     ebx,12
   imul     ecx,12
   mov      esi,.points_rot
   ; add    eax,.points_rot
   ; add    ebx,.points_rot
   ; add    ecx,.points_rot
   movlps   xmm0,[eax+esi]
   movlps   xmm1,[ebx+esi]
   movlps   xmm2,[ecx+esi]
   xorps    xmm7,xmm7
   ; the three z values, read back as one vector from [esp]
   push     dword[esi+ecx+8] dword[esi+ebx+8] dword[esi+eax+8]
   packssdw xmm0,xmm7            ; two dword coords -> two words
   packssdw xmm1,xmm7
   packssdw xmm2,xmm7

   movd     eax,xmm0
   movd     ebx,xmm1
   movd     ecx,xmm2
   mov      edx,1000 shl 16 + 1000
   movd     xmm1,edx
   movups   xmm0,[esp]
   movups   xmm1,.y_min
   add      esp,12
   mov      esi,.z_buff
   movlps   xmm5,.shll  ; stencil_line
   call     stencil_tri
 .skip:
   pop      ecx
   pop      esi
   add      esi,12
   inc      ecx
   cmp      ecx,.triangles_count_var
   jne      .rd
   mov      esi,.t_ptr ;[triangles_ptr]    ; check which are rendered /at front/
   xor      ecx,ecx
   mov      eax,1.3
   movd     xmm7,eax   ;[eps3]  ;                epsilon1]
   shufps   xmm7,xmm7,0
 .check_in:
   push     esi
   mov      eax,[esi]
   mov      ebx,[esi+4]
   mov      edx,[esi+8]
   imul     eax,12
   imul     ebx,12
   imul     edx,12
   add      eax,.points_rot
   add      ebx,.points_rot
   add      edx,.points_rot
   movss    xmm0,[eax+8]         ; xmm0 = mean z of the triangle
   addss    xmm0,[ebx+8]
   addss    xmm0,[edx+8]
   mulss    xmm0,.crp3
   movlps   xmm1,[eax]           ; sum of the integer screen coords
   movlps   xmm2,[ebx]
   movlps   xmm3,[edx]
   paddd    xmm1,xmm2
   paddd    xmm1,xmm3
   sub      esp,8
   movlps   [esp],xmm1
   pop      eax esi
   ; movd    eax,xmm1
   ; psrldq  xmm1,4
   ; movd    esi,xmm1
   mov      ebx,.irp3
   mul      ebx                  ; high dword = sum / 3
   push     edx         ; edx - y
   mov      eax,esi
   mul      ebx         ; edx - x
   pop      eax
   mov      ebx,1000
   imul     eax,ebx      ; y
   add      eax,edx
   and      eax,0xfffff
   shl      eax,2
   add      eax,.z_buff
   movlps   xmm6,[eax]
   ; lane 0 of xmm0 becomes all ones when the buffer depth lies inside
   ; the epsilon band around the centre depth
   movaps   xmm1,xmm0
   addps    xmm1,xmm7
   subps    xmm0,xmm7
   cmpltps  xmm1,xmm6
   cmpltps  xmm0,xmm6
   xorps    xmm0,xmm1
   movd     eax,xmm0
   movd     xmm5,ecx             ; keep the triangle index
   mov      ebx,ecx
   shr      ebx,3
   and      ecx,111b
   add      ebx,.tri_mark
   xor      esi,esi
   cmp      .do_fix,1
   je       @f
   or       eax,eax
   jz       @f
   bts      word[ebx],cx      ; triangle with appropriate bit set -
  @@:                         ; was rendered /in front/
   cmp      .do_fix,esi
   je       @f
   ; check if rendered now
   ; not - skip forward
   bt       word[ebx],cx   ; was rendered (in z buff) previously ?
   jnc      @f
   or       eax,eax        ; is it rendered now ?
   jz       .fix_norm      ; no - fix the triangle
 @@:
 .nxchck:
   movd     ecx,xmm5
   inc      ecx
   pop      esi
   add      esi,12
   cmp      ecx,.triangles_count_var
   jnz      .check_in
   inc      .do_fix
   cmp      .do_fix,2
   jne      .sec                   ; second rendering
                                   ; with enabled backface culling
   jmp     .done
 .fix_norm:
   ; swap the first two vertex indices of triangle xmm5
   cld
   movd     esi,xmm5
   imul     esi,12
   add      esi,.t_ptr
   mov      edi,esi
   lodsd           ;swap
   mov      ebx,eax
   movsd
   mov      eax,ebx
   stosd
   inc      .fx_occ
   jmp      .nxchck
  .done:
  @@:
   mfree    .norm_rot
   mfree    .points_rot
   mfree    .z_buff
   mfree    .tri_mark
   cls
   add      esp,152
   pop      ebp
ret

;======================================================================
mark_inner_vert_th:
; Thread procedure: find vertices inside object.
; The object is projected from many directions (three angles, each
; stepped by .step); for every direction a z buffer is built by
; do_stencil and every vertex is tested against the buffer value at its
; own pixel. Each thread handles its own range of the first angle.
; in :  ebx - low byte      = thread No
;             bits 8..31    = chunk number, or all ones for the
;                             standard 'no chunk' work
; out:
;       dword[thread_params+thread No*4] -  ptr to list
;       of inside vertices, if vertex is inside, its bit is zeroed.
; Fixes compared with the previous version:
;  - the mode tests used '.marker', which is not a local of this
;    procedure; the mode word is kept in '.mark',
;  - the initial fill of the bit list used count>>7 dwords and so
;    covered only a quarter of the list; it uses count>>5 now.
   push  ebp
   mov   ebp,esp
   sub   esp,260
   sub   ebp,128
   .f500x3         equ dword[ebp-4]
   .inner_vert     equ dword[ebp-8]  ; ptr to desired list
 ; .outside_tri    equ dword[ebp-8]  ; once again the same
   .in_vert_count  equ dword[ebp-12]
   .side0          equ dword[ebp-16]
   .mark_outs_tris equ byte [ebp-20]
;  .do_fix         equ dword[ebp-20]
   .norm_rot       equ dword[ebp-24]
   .sinx           equ dword[ebp-28]
   .cosx           equ dword[ebp-32]
   .siny           equ dword[ebp-36]
   .cosy           equ dword[ebp-40]
   .sinz           equ dword[ebp-44]
   .cosz           equ dword[ebp-48]
   .points_rot     equ dword[ebp-52]
   .side1          equ dword[ebp-56]
   .side2          equ dword[ebp-60]
   .end1           equ dword[ebp-64]
   .end2           equ dword[ebp-68]
   .step           equ dword[ebp-72]
   .cntt           equ dword[ebp-100]
   .norm_to_rev    equ dword[ebp-104]
   .th_no          equ dword[ebp-108]
   .start0         equ dword[ebp-112]
   .start1         equ dword[ebp-116]
   .start2         equ dword[ebp-120]
   .end0           equ dword[ebp-124]
   .mark           equ word [ebp-126]   ; 'iv' or 'ic' working mode

   .matrix              equ      [ebp]
   .eps3                equ      [ebp+40]
   .i1000               equ dword[ebp+44]
   .i4000               equ dword[ebp+48]
   .f490x3              equ dword[ebp+52]
   .points_count_var    equ dword[ebp+60]
   .triangles_count_var equ dword[ebp+64]
   .points_r_ptr        equ dword[ebp+68]
   .triangles_ptr       equ dword[ebp+72]
   .ymin                equ      [ebp+76]
   .ymax                equ      [ebp+80]
   .xmin                equ      [ebp+84]
   .xmax                equ      [ebp+88]
   .shl1000             equ      [ebp+92]
   .shll                equ      [ebp+96]
;  .blank               equ      [ebp+100]
   .z_buff              equ      [ebp+104]
   .chu_no              equ      [ebp+108]

; Work for marking vertices inside one concrete, separate chunk.
; If all 24 highest bits of ebx are set, it means the standard
; 'no chunk' work ('iv'). Otherwise these bits are the number of the
; chunk ('ic'): mark the verts inside this chunk and the inner verts
; possessed by this chunk.
   movzx    eax,bl
   mov      .th_no,eax
   shr      ebx,8
   mov      .chu_no,ebx
   mov      dx,'iv'
   mov      ax,'ic'
   cmp      ebx,0x00ffffff
   cmove    eax,edx
   mov      .mark,ax
   cld
   ; copy four consecutive globals, starting at points_count_var,
   ; into the locals .points_count_var .. .triangles_ptr
   lea      esi,[points_count_var]
   lea      edi,.points_count_var
   movsd
   movsd
   movsd
   movsd

   push     [eps3]
   pop      dword .eps3
   mov      ebx,stencil_line
   mov      eax,1000
   xor      edx,edx
   mov      .shll,ebx
   mov      .ymin,edx
   mov      .xmin,edx
   mov      .ymax,eax
   mov      .xmax,eax
   mov      .i1000,eax
   shl      eax,2
   mov      .i4000,eax
   mov      .f490x3,f490x3
   mov      .f500x3,f500x3
   mov      .shl1000,dword (1000 shl 16 + 1000)
; Alloc 1000x1000  buffs
   mov      eax,1050*1050*4+65536
   malloc   eax
   mov      .z_buff,eax

   mov      ebx,.points_count_var
   shr      ebx,3                ; one bit per vertex
   add      ebx,30
   malloc   ebx
   mov      .inner_vert,eax
   shl      ebx,3+2+5
   lea      ebx,[ebx*3]
   malloc   ebx
   mov      .points_rot,eax
   ; Initial state of the list: all zero for 'iv' (bits get set for
   ; vertices seen from outside), all ones for 'ic' (bits get cleared).
   mov      edi,.inner_vert
   mov      ecx,.points_count_var
   shr      ecx,5                ; 32 vertices per stored dword
   inc      ecx
   or       ebx,-1
   xor      eax,eax
   cmp      .mark,'ic'
   cmove    eax,ebx
   cld
   rep      stosd

   ; Range of the first angle for this thread: 256/threads values,
   ; starting at th_no * (256/threads).
   mov      ecx,[CoresCount]
   and      ecx,0xf
   mov      ebx,.th_no
   mov      eax,256
   cdq
   idiv     ecx
   mov      .end0,eax
   imul     ebx,eax
   mov      .start0,ebx
   add      .end0,ebx
   mov      .step,24
; Do parallel projection.
; Render faces - only z coord
   push     .start0
   pop      .side0
   ; mov    .side0,0
 .next_side0:
   xor      eax,eax
   mov      .side1,eax
 .next_side1:
   xor      eax,eax
   mov      .side2,eax
 .next_side2:
   mov      eax,.side0
   mov      ebx,.side1
   mov      edx,.side2
   lea      edi,.matrix
   call     make_matrixx
   lea      esi,.matrix     ;esi - pointer to 3x3 matrix
   mov      ebx,.f490x3     ;ebx - ptr to scale
   call     add_scale_to_matrix
   mov      esi,.points_r_ptr  ; vertices to project
   mov      edi,.points_rot
   mov      ecx,.points_count_var
   lea      ebx,.matrix
   call     rotary
   mov      esi,.points_rot
   mov      ecx,.points_count_var
   mov      edi,.f500x3
 @@:
   ; move every projected vertex to the buffer centre
   movups   xmm0,[esi]
   addps    xmm0,[edi]
   movhlps  xmm1,xmm0
   movlps   [esi],xmm0
   movss    [esi+8],xmm1
   add      esi,12
   loop     @b
   mov      ecx,1020*1020
   mov      edi,.z_buff
   mov      eax,6000.1           ; 'nothing rendered' depth
   cld
   rep      stosd
   movups   xmm1,.shl1000
   mov      edx,.points_rot
   mov      ax,.mark
   cmp      ax,'ic'     ; 'ic' = Inner vertices of concrete Chunk
   cmove    ebx,.chu_no
   movups   xmm5,.ymin
   call     do_stencil
   mov      esi,.points_rot
   xor      ecx,ecx              ; ecx = vertex index
   movlps   xmm7,.eps3  ; epsilon
   shufps   xmm7,xmm7,0
 .nxcnt:
   push     ecx
   mov      edi,ecx
   movlps   xmm1,[esi]
   cvtps2dq xmm1,xmm1            ; x,y -> integers, stored back
   movlps   [esi],xmm1
   movlps   xmm0,[esi+8]
   shufps   xmm0,xmm0,0          ; z of the vertex in all lanes
   mov      eax,[esi+4] ; y coord
   mov      ebx,[esi]   ; x
   imul     eax,.i1000
   add      ebx,eax
   shl      ebx,2
   and      ebx,0x3FFFFF
   add      ebx,.z_buff
   cmp      [ebx],dword 6000.1
   je       .mrk                 ; nothing was rendered at this pixel
   movups   xmm1,[ebx]
   movups   xmm2,[ebx]
   cmp      .mark,'ic'
   jne      .noic
   mov      eax,.chu_no
   shl      eax,4                ; 16 bytes per chunk descriptor
   add      eax,[chunks_desc_ptr]
   ;  + 0 - count of tris in chunk
   ;  + 4 - count of vertices in chunk
   ;  + 8 - vertices offset
   ;  + 12 - triangles_offset
   ; Solution for marking verts inside single chunk:
   ;  - project this single chunk into Z stencil buff
   ;  - check buff - if it is the floor - mark,
   ;               - if not the floor - check if curr vert
   ;                 is below curr pix.
   ; This way all verts inside .chu_no chunk and all inside verts
   ; that .chu_no possesses are marked.
   ; !!Chunks should be sorted, use 'sort chunk' button!!
   ; NOTE(review): cmpltss does not change the CPU flags, so the
   ; 'jb .skp' after it still sees the flags of the preceding integer
   ; cmp; a flag setting compare (comiss) was presumably intended.
   ; Left unchanged - confirm the wanted direction of the test.
   cmp      ecx,[eax+4]
   jb       @f
   mov      edx,[eax+8]
   add      edx,[eax+4]
   cmp      ecx,edx
   jb       .skp
 @@:
   cmpltss  xmm0,xmm1
   jb       .skp
   jmp      .mrk
 .noic:
   ; vertex counts as 'seen' when the buffer depth lies inside the
   ; epsilon band around the vertex depth
   addps    xmm1,xmm7
   subps    xmm2,xmm7
   cmpltps  xmm1,xmm0
   cmpltps  xmm2,xmm0
   xorps    xmm2,xmm1
   movmskps eax,xmm2
   and      eax,111b
   or       eax,eax
   jz       .skp
 .mrk:
   mov      edx,ecx
   shr      edx,3
   and      ecx,111b
   add      edx,.inner_vert
   cmp      .mark,'ic'
   je       .cln
   bts      [edx],ecx  ; if bit is set vertex is not inside
   jmp      .skp
 .cln:
   btr      [edx],ecx
 .skp:
   pop      ecx
   add      esi,12
   inc      ecx
   cmp      ecx,.points_count_var
   jnz      .nxcnt

 .end_steps:
   mov      ecx,.step
   add      .side2,ecx
   ; mov    ebx,.end2
   cmp      .side2,256 ;ebx
   jng      .next_side2
   add      .side1,ecx
   ; mov    ebx,.end1
   cmp      .side1,256  ;ebx
   jng      .next_side1
   add      .side0,ecx
   mov      ebx,.end0
   cmp      .side0,ebx
   jng      .next_side0
   mov      ecx,.inner_vert
 @@:

   ; publish the list of this thread
   mov      ebx,.th_no
   shl      ebx,2
   add      ebx,thread_params
   mov      [ebx],ecx
   mfree    .points_rot
   mfree    .z_buff
   cls
   add      esp,260
   pop      ebp
ret
;===============================================================
;===============================================================
;===============================================================
do_edges_list:
; Build the list of unique edges of the object, chunk by chunk, so
; that every chunk owns a continuous part of the edge list.
; in:
;       some global variables ([triangles_ptr], [triangles_count_var],
;       [chunks_ptr], [chunks_count]) and
;       eax = 'opt' - optimize chunks (stored in .do_opt only)
; out:   values  / structures
;             [chunk_desc_Ex_ptr]
;             [edges_ptr]
;             [edges_count]
;             [edge_s_d_ptr]   - one bit per edge, zeroed for edges
;                                shared by two triangles
;             [edges_d_count]  - count of such double edges
;             [greatest_chunk]
;   structure with edges (64 bytes per chunk, indexed by chunk number):
;          mov     eax,[chunk_desc_Ex_ptr]
;          dword [eax] -  edges count in chunk
;          dword [eax+4] - edges offset
; Fixes compared with the previous version:
;  - .ed_sd_cnt was incremented and returned without ever being
;    initialised; it starts at zero now,
;  - the fill of the single/double bit list wrote one dword more than
;    the allocated size; it now stays inside the allocation (the list
;    is several times larger than the bits actually used).
    push    ebp
    mov     ebp,esp
    sub     esp,100
    .ed_cnt              equ [ebp-4]
    .ed_ptr              equ [ebp-8]
    .counter             equ [ebp-12]
    .ed1_ptr             equ [ebp-16]
    .chunks_Ex_ptr       equ [ebp-20]
    .ed_s_d              equ [ebp-24]      ; edge - single / double
    .ed_sd_cnt           equ dword[ebp-28] ; count of double edges
    .triangles_count_var equ dword[ebp-32]
    .triX3               equ dword[ebp-36]
    .cur_chunk_no        equ dword[ebp-40]
    .cur_tri_ptr         equ dword[ebp-44]
    .cur_ed_ptr          equ dword[ebp-48]
    .cur_chunk_ptr       equ dword[ebp-52]
    .cur_chunk_counter   equ dword[ebp-56] ; tris in curr chunk
    .ed2_ptr             equ dword[ebp-60]
    .edges_acc           equ dword[ebp-64]
    .chunks_count        equ dword[ebp-68]
    .chunks_ptr          equ dword[ebp-72]
    .chu_end             equ dword[ebp-76]
    .do_opt              equ dword[ebp-80]

    mov        .do_opt,eax
    mov        ecx,[chunks_count]  ;,ecx
    mov        .chunks_count,ecx
    mov        eax, [triangles_count_var]
    mov        .triangles_count_var,eax
    lea        eax,[eax*3]
    mov        .triX3,eax
    ; do edges according to chunk - every chunk is a continuous part
    ; of the edges list
    prompt     prompt_edges
    mfree      [edges_ptr]
    mfree      [edge_s_d_ptr]
    mfree      [chunk_desc_Ex_ptr]
    mov        esi,[chunks_ptr]
    mov        ecx,[triangles_count_var]
    mov        .chunks_ptr,esi
    mov        .triangles_count_var,ecx
    add        ecx,ecx             ; one word per triangle in chunks list
    add        ecx,esi
    dec        ecx
    mov        .chu_end,ecx
    mov        ecx,.triangles_count_var
    mov        ebx,ecx             ; one chunk: greatest chunk = all tris
    cmp        .chunks_count,1
    je         .en1

    ; find the longest run of equal chunk numbers
    cld
    xor        edx,edx
    xor        ebx,ebx
  .lab1:
    lodsw
    cmp        ax,[esi]
    je         @f
    cmp        ebx,edx
    cmovb      ebx,edx
    xor        edx,edx
    loop       .lab1
    jmp        .en1
   @@:
    inc        edx
    loop       .lab1
 .en1:
    mov        [greatest_chunk],ebx
    add        ebx,100
    ; shl        ebx,3
    imul       ebx,36
    malloc     ebx
    mov        .ed1_ptr,eax        ; two work buffers for one chunk
    malloc     ebx
    mov        .ed2_ptr,eax
    mov        ebx,.triX3
    add        ebx,100
    imul       ebx,12
    ; shl        ebx,3
    malloc     ebx
    mov        .ed_ptr,eax         ;  ed_ptr - whole edges list
    ; mov        [edges_ptr],eax     ;
    mov        .cur_ed_ptr,eax     ;
    push       ebx
    mov        eax,.chunks_count
    add        eax,8
    shl        eax,6               ; 64 bytes per chunk descriptor
    malloc     eax
    mov        [chunk_desc_Ex_ptr],eax
    mov        .chunks_Ex_ptr,eax
    pop        ebx
    shr        ebx,5
    malloc     ebx
    mov        .ed_s_d,eax
    ; preset all bits: every edge is 'single' until found twice
    mov        ecx,ebx
    mov        edi,eax
    shr        ecx,2               ; whole dwords inside the allocation
    or         eax,-1
    cld
    rep        stosd
    xor        edx,edx
    mov        .cur_chunk_no,1
    mov        .edges_acc,edx
    mov        .ed_sd_cnt,edx      ; no double edges counted yet
    mov        eax,[triangles_ptr]
    mov        edx,.chunks_ptr
    mov        .cur_tri_ptr,eax
    mov        .cur_chunk_ptr,edx
    mov        ecx,.chunks_count
    ; inc      ecx
    mov        dword[edx],0x00010001
  .chunk_loop:
    push       ecx
    xor        eax,eax
    mov        .cur_chunk_counter,eax    ; tri no in curr chunk
    mov        ebx,.cur_chunk_no
    mov        esi,.cur_chunk_ptr
    mov        edi,.ed1_ptr
    mov        edx,.cur_tri_ptr
    cld
  .b:
    ; collect the three edges of every triangle of the current chunk
    cmp        esi,.chu_end
    jae        .br2
    lodsw
    cmp        .chunks_count,1   ; one chunk case
    je         .skp              ; maybe sort one chunk object also ??
                                 ;  - to detect possible holes in chunks_ptr struct
    cmp        bx,ax
    jne        .f
  .skp:
    inc        .cur_chunk_counter
    movups     xmm0,[edx]
    movlps     [edi],xmm0            ; edge a,b
    shufps     xmm0,xmm0,10001001b
    movups     [edi+8],xmm0          ; edges b,c and a,c
    add        edi,24
    add        edx,12
    jmp        .b
  .f:
    cmp        .cur_chunk_counter,0
    jz         .b
  .br2:
    sub        esi,2
    ; mov      .cur_chunk_no,ebx
    mov        .cur_chunk_ptr,esi
    mov        .cur_tri_ptr,edx  ;[triangles_ptr]
    mov        ebx,.ed1_ptr
    mov        ecx,.cur_chunk_counter
    or         ecx,ecx
    jz         .br3
    ; add        ecx,ecx
    lea        ecx,[ecx*3]
  .mxd:
    ; order the two indices of every edge: lower index first
    mov        eax,[ebx]
    mov        edx,[ebx+4]
    cmp        eax,edx
    jb         .nx_ed_a
    mov        [ebx],edx
    mov        [ebx+4],eax
  .nx_ed_a:
    add        ebx,8
    loop       .mxd

    mov        esi,.ed1_ptr  ;[edges_ptr]
    mov        edi,.ed2_ptr
    mov        ecx,.cur_chunk_counter
    lea        ecx,[ecx*3]
    call       sort_hybrid
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  make edges list
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; insert, sort again: inside every run of edges with the same
    ; first index, order the edges by their second index
    mov        ebx,.ed1_ptr  ;[edges_ptr]
    mov        ecx,.cur_chunk_counter
    lea        ecx,[ecx*3]
    mov        esi,ecx
    shl        esi,3
    add        esi,ebx       ; esi = end of the chunk's edges
 .count:
    push       ecx
    push       ebx
    xor        ecx,ecx
    mov        eax,[ebx]   ; count the run with this first index
 .aa:
    inc        ecx
    add        ebx,8
    cmp        ebx,esi
    jae        .br         ; break
    cmp        eax,[ebx]
    je         .aa
    mov        .counter,ecx
    pop        ebx
    mov        edi,ebx
    sub        edi,8
    mov        edx,[ebx+8]
 .ccc2:
    cmp        ebx,esi
    jbe        @f
    add        esp,4
    jmp        .ff
 @@:
    mov        eax,[ebx+12]
    mov        edx,[ebx+8]
    cmp        eax,[ebx+4]
    jge        .gg2
    movlps     xmm0,[ebx+8]
    push       ebx
 .c2:
    cmp        eax,[ebx+4]
    jae        .done2
    movlps     xmm7,[ebx]
    movlps     [ebx+8],xmm7
    sub        ebx,8
    cmp        ebx,edi
    jz         @f
    cmp        [ebx+8],edx
    jz         .c2
 @@:
 .done2:
    add        ebx,8
    movlps     [ebx],xmm0
 .p2:
    pop        ebx
 .gg2:
    add        ebx,8
    dec        ecx
    cmp        ecx,1
    jnz        .ccc2
    pop        ecx
    sub        ecx,.counter
    add        ebx,8
    ja         .count
    jmp        .ff
 .br: ;break
    add        esp,8
 .ff:
    ; count edges: copy every edge once to ed2, an edge equal to its
    ; successor is a double edge and is only recorded in the bit list
    mov        ecx,.cur_chunk_counter
    ; or       ecx,ecx
    ; jz       .nx_chunk
    lea        ecx,[ecx*3]
    mov        esi,.ed1_ptr
    mov        edi,.ed2_ptr
    xor        edx,edx      ; edx = unique edges in this chunk
    cld
 .nx:
    movlps     xmm0,[esi]
    add        esi,8
    movlps     xmm1,[esi]
    pcmpeqd    xmm1,xmm0
    movmskps   eax,xmm1
    and        al,11b
    cmp        al,11b
    je         .ff2
    movlps     [edi],xmm0
    add        edi,8
    inc        edx
    loop       .nx
    jmp        .end_loop2
 .ff2:
    ; double
    push       ecx
    push       edx
    inc        .ed_sd_cnt        ; count double edges
    add        edx,.edges_acc    ; global index of the edge
    mov        ecx,edx
    shr        edx,3
    and        ecx,111b
    add        edx,.ed_s_d
    btr        dword[edx],ecx    ; if edge is doubled -  bit is zeroed
    pop        edx
    pop        ecx
    loop       .nx
 .end_loop2:

    mov        .ed_cnt,edx
    mov        eax,.cur_chunk_no
    shl        eax,6
    add        eax,.chunks_Ex_ptr  ;.chunk_desc_Ex_ptr
    mov        [eax],edx           ; edges count in chunk
    mov        ebx,.edges_acc
    mov        [eax+4],ebx         ; offset
    add        .edges_acc,edx      ; increase accumulator
    mov        edi,.cur_ed_ptr     ; append to the whole edges list
    mov        esi,.ed2_ptr
    mov        ecx,edx
    add        ecx,ecx             ; two dwords per edge
    cld
    rep        movsd
    mov        .cur_ed_ptr,edi
 .nx_chunk:
    inc        .cur_chunk_no
    pop        ecx
    dec        ecx
    jnz        .chunk_loop
    jmp        .en3
 .br3:
    pop        eax                 ; drop the saved chunk counter
 .en3:
    mfree      .ed1_ptr
    mfree      .ed2_ptr
    ; realloc if needed: shrink the list when much more was allocated
    ; than is used
    mov        eax,.ed_ptr
    mov        ecx,.triX3
    mov        ebx,.edges_acc
    sub        ecx,ebx
    cmp        ecx,40000      ; border value
    jna        .no_realloc
    add        ebx,100
    shl        ebx,3
    malloc     ebx
    mov        .ed1_ptr,eax   ; ed1_ptr variable - reused
    mov        edi,eax
    mov        esi,.ed_ptr
    mov        ecx,.edges_acc
    add        ecx,ecx
    cld
    rep        movsd
    mfree      .ed_ptr
    mov        eax,.ed1_ptr
 .no_realloc:
    mov        ecx,.edges_acc  ;.ed_cnt
    mov        edx,.ed_s_d
    mov        ebx,.ed_sd_cnt
    mov        [edges_ptr],eax
    mov        [edges_count],ecx
    mov        [edge_s_d_ptr],edx
    mov        [edges_d_count],ebx        ; double edges
    cls
    mov        esp,ebp
    pop        ebp
ret
;===========================================================================
;===========================================================================
;===========================================================================
remove_redundant_vert_ch:  ; remove redundant vertices (merge vertices)
; For every vertex A that passed the tessellate-area test, every other
; eligible vertex B whose coordinates fall inside the box A +/- tolerance
; is merged into A: all triangles that reference B are rewritten to
; reference A, and B is flagged in the bit masks so it is never visited
; again. Finally remove_unused_vertices is called on the global lists.
; in:
;     registers:
;        eax  = vertex tolerance (dword float, broadcast to all lanes)
;        xmm0 = tolerance applied to the vertex normal vectors
;        ebx  = mask compared with the movmskps result of the box test:
;               111b if the 'Z' coord is important, 11b if it is not
;        ecx  = mode bits:
;               bit 1 (bt .val,1) - additionally compare vertex normals
;               bit 2 (bt .val,2) - merge inside one chunk only,
;                                   edx = chunk number/id
;               bit 3 (bt .val,3) - do not destroy the manifold structure
;                                   of the object (merge only inside the
;                                   chunk of the current vertex)
;               bit 4 (bt .val,4) - "from tex" mark
;
;              !! If bit 2 or bit 3 is set in ecx ->                  !!
;              !! -> always sort and optimize chunks, so every        !!
;              !! chunk must have its own continuous area of the      !!
;              !! vertices list                                       !!
;     globals:
;        points_r_ptr, triangles_ptr, points_normals_ptr
;        points_count_var, triangles_count_var
;        (six consecutive dwords starting at points_count_var are copied
;         into the locals .points_count_var .. .points_n_rotated_ptr)
;
; out:
;        recomputed vertices (points) and faces (triangles) list,
;        [points_count_var] updated from remove_unused_vertices (ecx)
;
; Fixes compared with the previous revision:
;  * the outer loop compared .counter with .v_h_cur/.v_l_cur even when
;    bit 3 was clear, i.e. against uninitialised stack memory; the test
;    is now guarded exactly like the one in the inner loop;
;  * the bit masks were allocated a few bytes shorter than the area that
;    is cleared / addressed with dword 'bts'; allocation is now padded.
   push        ebp
   mov         ebp,esp
   sub         esp,270
   and         ebp,-16                ; 16-byte aligned frame for movaps
   sub         ebp,96
   .mask                 equ dword[ebp-4]
   .tri_ch               equ      [ebp-8]
   .t_ptr                equ      [ebp-12]
   .counter              equ      [ebp-16]
   .aprox_vert           equ      [ebp-32]     ; vertex tolerance
   .aprox_norm           equ      [ebp-48]     ; normal tolerance
   .tri_area_x2          equ      [ebp-56]
   .tri_area_x1          equ      [ebp-64]

   .points_to_process    equ dword[ebp-112]
   .marker               equ dword[ebp-108]
   .chunk_no             equ dword[ebp-104]
   .ccnt2                equ dword[ebp-100]

   .msk3                 equ dword[ebp-96]
   .mask2                equ dword[ebp-92]
   ; the next six dwords mirror the globals, keep their order
   .points_count_var     equ dword[ebp-88]
   .triangles_count_var  equ dword[ebp-84]
   .points_r_ptr         equ dword[ebp-80]
   .triangles_ptr        equ dword[ebp-76]
   .points_rotated_ptr   equ dword[ebp-72]
   .points_n_rotated_ptr equ dword[ebp-68]

   .mxx                  equ [ebp]
   .ver_h                equ [ebp+40]
   .ver_l                equ [ebp+44]
   .v_h_cur              equ [ebp+48] ; current chunk variables
   .v_l_cur              equ [ebp+52] ; vars used in non destroy
                                      ; manifold chunk structure
                                      ; mode
    mov        .msk3,ebx
    mov        .marker,ecx
    movaps     .aprox_norm,xmm0       ; tolerated deviation of normal vect
    mov        .chunk_no,edx
    movd       xmm7,eax
    shufps     xmm7,xmm7,0            ; broadcast vertex tolerance
    movaps     .aprox_vert,xmm7
    cld
    lea        esi,[points_count_var]  ; copy six global dwords to locals
    lea        edi,.points_count_var
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd

    mov        eax,.points_rotated_ptr
    mov        .points_to_process,eax
    movups     xmm0,[tri_area_x1]   ; local copy of x1,y1,x2,y2 of the area
    movups     .tri_area_x1,xmm0    ;
    bt         .marker,4            ; from tex mark
    jnc        .no_rev
    ; "from tex" mode: area becomes (0.9, 0.9) .. (-0.9, -0.9) and the
    ; points are re-rotated with the reversed matrix_scaled
    mov        eax,-0.9
    push       eax eax
    mov        eax,0.9
    push       eax eax
    movups     xmm0,[esp]
    movups     .tri_area_x1,xmm0
    add        esp,16
    push       esi edi
    mov        esi,matrix_scaled
    lea        edi,.mxx
    call       reverse_mx_3x3
;    cvtdq2ps   xmm3,[xxadd]
;    mov        esi,.points_rotated_ptr
;    mov        edi,.points_r_ptr
;    mov        ecx,.points_count_var
;    andps      xmm3,[zero_hgst]
;  .trans4:
;    movups     xmm0,[esi]
;    subps      xmm0,xmm3
;    movups     [edi],xmm0
;    add        edi,12
;    add        esi,12
;    loop       .trans4
    mov        esi,.points_r_ptr
    mov        ecx,.points_count_var
    mov        edi,.points_rotated_ptr
    lea        ebx,.mxx
    call       rotary
    pop        edi esi
  .no_rev:

    ; One bit per vertex. The buffers are cleared as ((n shr 5) + 1)
    ; dwords and addressed with dword 'bts' at byte offset (index shr 3),
    ; so n/8 + 8 bytes always cover every access.
    mov        ebx,.points_count_var
    add        ebx,64
    shr        ebx,3
    malloc     ebx
    mov        .mask,eax
    malloc     ebx
    mov        .mask2,eax
    mov        eax,1            ; export esi, edi
    mov        ebx,0x33333333   ; only pivot
    call       detect_chunks
    ; esi -- tri_ch    ; vertices with triangles list
    ; edi -- t_ptr     ; pointers to tri_ch list
    mov        .tri_ch,esi
    mov        .t_ptr,edi
    bt         .marker,2
    jnc        .no_chunk1   ; chunks checking?
    ; chunks must be recognized
    ; struct [chunks_desc_ptr] must be filled
    mov        eax,.chunk_no
    lea        ebx,.ver_h
    call       get_min_max_vert
    ; mov      eax,[chunks_desc_ptr]
    ; [eax]    = tri  count
    ; [eax+4]  = vert count in curr chunk
    ; [eax+8]  = vert offset
    ; [eax+12] = tri offset
  .no_chunk1:
    prompt     prompt_red_vert

    mov        edi,.mask        ; clear bit mask
    mov        ecx,.points_count_var
    shr        ecx,3+2
    inc        ecx
    xor        eax,eax
    cld
    rep        stosd
    mov        edi,.mask2       ; clear second bit mask
    mov        ecx,.points_count_var
    shr        ecx,3+2
    inc        ecx
    xor        eax,eax
    cld
    rep        stosd
    ; check if in triangulize (tessellate) area

    ; cmp      [set_tri_area],1
    ; jne      .nochck    ; no area check, not now.
    ; bt       .marker,0  ; tri area checking?
    ; jnc      .nochck    ; I assume this check is always
    ; bt       .marker,2  ; particular chunk check ?
    ; jnc      .nochck
    mov        esi,.points_to_process
    xor        ecx,ecx
    movlps     xmm7,.tri_area_x1
    movlps     xmm6,.tri_area_x2
 .chck:
    ; x and y of the vertex must lie between the two area corners;
    ; a vertex outside gets its bit set in .mask (= never processed)
    movups     xmm0,[esi]
    movups     xmm1,[esi]
    cmpltps    xmm1,xmm7
    cmpltps    xmm0,xmm6
    xorps      xmm0,xmm1
    movmskps   eax,xmm0
    and        eax,11b
    cmp        eax,11b
    je         .yes

    mov        edi,ecx
    mov        eax,ecx
    shr        edi,3
    and        eax,111b
    add        edi,.mask
    bts        dword[edi],eax
  .yes:
    add        esi,12
    inc        ecx
    cmp        ecx,.points_count_var
    jne        .chck
  .nochck:
    xor        eax,eax
    mov        esi,.points_to_process
    mov        .counter,eax
  .nx_a1:                   ; ---- outer loop: vertex A = .counter ----
    push       esi
    ; if non destr chunk mode
    bt         .marker,3    ; non destr chunks struct
    jnc        @f
    pushad
    mov        edx,.counter
    or         eax,-1       ; look the chunk up by vertex number
    lea        ebx,.v_h_cur
    call       get_min_max_vert
    popad
  @@:
    mov        edi,.counter
    mov        eax,.counter
    shr        edi,3
    and        eax,111b
    add        edi,.mask
    bt         dword[edi],eax
    jc         .nx_a        ; A is masked out
    bt         .marker,2    ; one chunk merging
    jnc        @f
    mov        ebx,.counter
    cmp        ebx,.ver_h
    ja         .nx_a
    cmp        ebx,.ver_l
    jb         .nx_a
    ; NOTE(review): al still holds (.counter and 111b) here, so this
    ; lets only every 8th vertex through in one-chunk mode - looks like
    ; a leftover; kept unchanged, please confirm the intent.
    cmp        al,10b
    jne        .nx_a
  @@:
    bt         .marker,3    ; .v_h_cur/.v_l_cur are filled only in
    jnc        .no_cur_rng  ; non destroy manifold mode
    mov        ebx,.counter  ; non destroy manif chunks struct
    cmp        ebx,.v_h_cur
    ja         .nx_a
    cmp        ebx,.v_l_cur
    jb         .nx_a
  .no_cur_rng:

    movups     xmm6 ,[esi]
    movaps     xmm7,xmm6
    subps      xmm6,.aprox_vert   ; xmm6 = A - tolerance
    addps      xmm7,.aprox_vert   ; xmm7 = A + tolerance
    mov        ebx,.points_to_process
    xor        ecx,ecx
  .nx_b1:                   ; ---- inner loop: vertex B = ecx ----
    mov        .ccnt2,ecx
    push       ebx
    cmp        ecx,.counter
    je         .nx_b
    bt         .marker,2
    jnc        @f
    cmp        ecx,.ver_h
    ja         .nx_b
    cmp        ecx,.ver_l
    jb         .nx_b
  @@:
    bt        .marker,3    ; non destr chunks struct
    jnc        @f
    cmp        ecx,.v_h_cur
    ja         .nx_b
    cmp        ecx,.v_l_cur
    jb         .nx_b
   @@:
    mov        edi,ecx
    mov        eax,ecx
    shr        edi,3
    and        eax,111b
    add        edi,.mask
    bt         dword[edi],eax
    jc         .nx_b        ; B is masked out / already merged
    movups     xmm1,[ebx]
    movups     xmm0,[ebx]
    cmpltps    xmm1,xmm6
    cmpltps    xmm0,xmm7
    xorps      xmm0,xmm1
    movmskps   eax,xmm0
    and        eax,.msk3
    cmp        eax,.msk3
    je         .rem_v_in    ; B lies inside the tolerance box of A
  .nx_b:

    pop        ebx
    add        ebx,12
    inc        ecx
    cmp        ecx,.points_count_var
    jne        .nx_b1
 .nx_a:
    pop        esi
    inc        dword .counter
    add        esi,12
    mov        ecx,.points_count_var
    cmp        .counter,ecx
    jne        .nx_a1
    jmp        .rm_v      ; remove unused vert via chunks proc
  .rem_v_in:              ; remove vert index (B = ecx -> A = .counter)
    mov        edi,ecx
    mov        ebx,ecx
    shl        edi,2
    add        edi,.t_ptr
    mov        edi,[edi]  ; edi -> {vert index, tri index} entries of B
    bt         .marker,1  ; normal vect checking?
    jnc        .l1
    mov        eax,.counter
    imul       eax,12
    add        eax,.points_n_rotated_ptr
    movups     xmm2,[eax]
    movaps     xmm3,xmm2
    addps      xmm3,.aprox_norm
    subps      xmm2,.aprox_norm
    mov        eax,[edi]
    imul       eax,12
    add        eax,.points_n_rotated_ptr
    movups     xmm4,[eax]
    cmpltps    xmm3,xmm4
    cmpltps    xmm2,xmm4
    xorps      xmm2,xmm3
    movmskps   eax,xmm2
    and        al,111b
    cmp        al,111b
    jne        .nx_b      ; normals differ too much - do not merge
  .l1:
    mov        esi,[edi+4]        ; [edi+4] - tri  index
    ; cmp      ebx,[edi]          ; [edi]   - vert index
    imul       esi,12
    add        esi,.triangles_ptr
    mov        ecx,3
    ; push       edi
    ; mov        edi,esi
    ; cld
  @@:
    mov        eax,[esi]          ; replace index B by index A
    cmp        eax,ebx
    cmove      eax,.counter
    mov        [esi],eax
    add        esi,4
    loop       @b
    ; mov        esi,edi
    ; pop        edi
  .llll:
    add        edi,8
    cmp        [edi],ebx          ; next entry still belongs to B?
    je         .l1
    mov        ecx,.ccnt2         ; restore inner loop counter
    mov        ebx,ecx
    mov        edx,ecx
    and        ebx,111b
    shr        edx,3
    push       edx
    add        edx,.mask
    bts        dword[edx],ebx     ; B is merged - never visit it again
    pop        edx
    add        edx,.mask2
    bts        dword[edx],ebx
    jmp        .nx_b
  .rm_v:                       ; fix
    mov        eax,[triangles_ptr]
    mov        ebx,[points_r_ptr]
    mov        ecx,[triangles_count_var]
    mov        edx,[points_count_var]
    call       remove_unused_vertices
    mov        [points_count_var],ecx
  .done:
    mfree      .mask2
    mfree      .mask
    mfree      .tri_ch
    mfree      .t_ptr
    cls        ;macro
    add        esp,270
    pop        ebp
ret
;=========================================================================
;=========================================================================
get_min_max_vert:
;   Reads one 16-byte chunk descriptor from [chunks_desc_ptr] and stores
;   the vertex range of that chunk into a two-dword structure.
;   Descriptor fields used here: [+4] = vert count, [+8] = vert offset.
;
;   in  eax = chunk No
;       ebx = address of the structure {.ver_h, .ver_l} (definition below)
;       if eax = -1, edx = vert No => find the chunk which contains this
;       vertex (offset <= vert <= offset + count) and use that chunk
;
;   out: filled structure, address in ebx:
;           .ver_l = vert offset, .ver_h = vert offset + vert count
;        eax = address of the chunk descriptor
;        if eax = 0, then no chunk was found, struct not filled
;        (on that path ebx is not restored)
;   proc changes registers: eax, edx, ecx, ebx, esi, edi, xm0
;
;   An empty chunk list ([chunks_count] = 0) is now reported as
;   "not found" instead of letting 'loop' wrap around; vertex numbers
;   are compared as unsigned values throughout.
    .ver_h     equ [ebx]
    .ver_l     equ [ebx+4]
    mov        edi,[chunks_desc_ptr]
    movd       xmm0,ebx     ; keep the structure address, ebx is scratch
    cmp        eax,-1
    jne        .ll
    xor        esi,esi
    mov        ecx,[chunks_count]
    mov        eax,edi
    test       ecx,ecx
    jz         .not_found   ; nothing to search
  .oop:
    mov        ebx,[eax+8]  ; v offset
    cmp        edx,ebx
    jb         .nx_ch       ; vertex lies below this chunk
    add        ebx,[eax+4]
    cmp        edx,ebx
    jna        .ll1         ; vertex <= offset + count -> found
  .nx_ch:
    inc        esi         ; esi = chunks No
    add        eax,16
    loop       .oop
  .not_found:
    xor        eax,eax
    jmp        .end
  .ll1:
    mov        eax,esi
  .ll:
    movd       ebx,xmm0     ; structure address back in ebx
    shl        eax,4        ; 16 bytes per descriptor
    add        eax,edi      ;[chunks_desc_ptr]
    mov        edx,[eax+8]  ; v offset
    mov        .ver_l,edx
    add        edx,[eax+4]
    mov        .ver_h,edx
  .end:
ret
;===========================================================================
;===========================================================================
draw_triangulize_rect:
    ; Wipes the tessellate-area flag buffer, asks write_frame to draw the
    ; area rectangle and then puts a handle bar on each of its two corners
    ; ([tri_area_x1] and [tri_area_x2]) through bar_tri_area.
    ; Registers are not preserved.
    push      ebp
    mov       ebp,esp
    .yres     equ [ebp-4]
    .xres     equ [ebp-8]
    movzx     ecx,[yres_var]
    movzx     ebx,[xres_var]
    push      ecx                     ; -> .yres
    push      ebx                     ; -> .xres
    ; flag buffer holds xres*yres/16 dwords, i.e. 2 bits for each pixel
    mov       edi,[edit_tri_area_ptr]
    xor       eax,eax
    imul      ecx,ebx
    shr       ecx,4
    cld
    rep       stosd
    ; x1,y1,x2,y2 floats -> signed words; eax = x1 | x2 shl 16,
    ; ebx = y1 | y2 shl 16
    movups    xmm0,dqword[tri_area_x1]
    cvtps2dq  xmm0,xmm0
    shufps    xmm0,xmm0,11011000b     ; x1,x2,y1,y2
    packssdw  xmm0,xmm0
    sub       esp,8
    movlps    [esp],xmm0
    pop       eax ebx

    ; xmm4 = xres, colour, screen pointer, (.xres local) read from stack
    push      [screen_ptr]
    push      dword 0x0000ff00
    push      dword .xres
    movups    xmm4,[esp]
    add       esp,12

    movlps    xmm7,.xres              ; xmm7 low qword = xres, yres
    xorps     xmm6,xmm6
    call      write_frame
    ; first corner, signature 1
    movlps    xmm0,qword[tri_area_x1]
    mov       esi,.yres
    mov       edx,.xres
    sub       esi,4
    sub       edx,4
    mov       ecx,1         ; other const mark
    call      bar_tri_area
    ; second corner, signature 0
    movlps    xmm0,qword[tri_area_x2]
    mov       esi,.yres
    mov       edx,.xres
    sub       esi,4
    sub       edx,4
    xor       ecx,ecx
    call      bar_tri_area
    leave
ret
;============================================================
bar_tri_area:
 ; Draws a 6x6 pixel bar (colour 0x00ffff00) on the screen, top-left corner
 ; at (x-3, y-3), and flags the same pixels in the buffer at
 ; [edit_tri_area_ptr]. Nothing is drawn when the point is closer than
 ; 6 pixels to the left/top edge or reaches the limits passed in edx/esi.
 ; in:
 ;  ecx  - signature - 0 / 1 (only bit 0 is used)
 ;  xmm0 - coordinates: x, y as floats in the low qword
 ;  edx  - x limit, kept in [.xresm4] (draw_triangulize_rect passes xres-4)
 ;  esi  - y limit, kept in [.yresm4] (draw_triangulize_rect passes yres-4)
 ; flag buffer layout: byte = pixel index shr 2,
 ;                     bit  = (pixel index and 3)*2 + 1 + signature
 ; changes: eax, ebx, ecx, edx, edi
    push     ebp
    mov      ebp,esp
    .xresm4  equ ebp-4
    .yresm4  equ ebp-8
    .ed      equ ebp-12
    .add_tb  equ ebp-16
    .add     equ ebp-20
    .xres    equ [ebp-24]
    ; locals are created by the pushes, keep the order
    push     edx                    ; [.xresm4]
    push     esi                    ; [.yresm4]
    push     ecx                    ; [.ed]
    movzx    ebx,[xres_var]
    lea      eax,[ebx-6]
    push     eax                    ; [.add_tb] = xres-6 pixels to next row
    shl      eax,2
    push     eax                    ; [.add]    = the same in bytes
    push     ebx                    ; .xres
    cvtps2dq xmm0,xmm0
    sub      esp,8
    movlps   [esp],xmm0
    pop      eax ebx                ; eax = x, ebx = y
    ; movd     eax,xmm0
    ; psrldq   xmm0,4
    ; movd     ebx,xmm0
    cmp      ebx,5
    jle      .skip
    cmp      eax,5
    jle      .skip
    cmp      eax,dword[.xresm4]
    jge      .skip
    cmp      ebx,dword[.yresm4]
    jge      .skip
    sub      ebx,3                  ; top row of the bar
    sub      eax,3                  ; left column of the bar
    mov      ecx,eax
    mov      eax,ebx
    mul      dword .xres            ; edx:eax = row * xres
    lea      ebx,[eax+ecx]          ; ebx = pixel index of the corner
    mov      edi,[screen_ptr]
    lea      edi,[edi+ebx*4]
    and      dword[.ed],1b
    ; draw bar 6x6
    mov      eax,6                  ; rows left
 .oop_ed:
    push     eax
    mov      eax,6                  ; pixels left in the row
  .oop1:
    mov      dword[edi],0x00ffff00
    push     ebx
    mov      ecx,ebx
    and      ecx,11b
    lea      ecx,[ecx*2+1]
    add      ecx,dword[.ed]         ; signature 1 uses the next bit
    shr      ebx,2
    add      ebx,[edit_tri_area_ptr]
    bts      [ebx],ecx
    pop      ebx
    inc      ebx
    add      edi,4
    dec      eax
    jnz      .oop1
    add      ebx,[.add_tb]
    add      edi,[.add]
    pop      eax
    dec      eax
    jnz     .oop_ed
  .skip:
    leave
ret
;==========================================================
calc_shadow:  ; build the 4x4 shadow matrix
 ; in esi -> light vector - ptr to 3 dwords float
 ;    ebx -> ptr to 4x4 shadow matrix 16 dwords, 64 bytes
 ; uses [plane_equation] (4 floats) and [zero_hgst] (mask that clears
 ; the highest dword)
 ; With L = (lx, ly, lz, 1.0), P = plane equation and
 ; d = lx*P0 + ly*P1 + lz*P2 the matrix written is
 ;    row i = P[i] * L,  and d is added to the diagonal element [i][i]
 ; changes: eax, xmm0 - xmm4, xmm7
     mov         eax,1
     movups      xmm1,[plane_equation]
     movaps      xmm7,xmm1            ; xmm7 = P, kept for all rows
     cvtsi2ss    xmm3,eax
     pslldq      xmm3,12              ; xmm3 = 0, 0, 0, 1.0
     movups      xmm0,[esi]
     andps       xmm0,[zero_hgst]
     orps        xmm0,xmm3            ; [one_hgst_dd1]
     movaps      xmm2,xmm0            ; xmm2 = L
     dpps        xmm0,xmm1,01110111b  ; d in the three low dwords
     movaps      xmm4,xmm0            ; xmm4 = dot product
     ; row 0
     pshufd      xmm3,xmm7,0          ; broadcast P0
     mulps       xmm3,xmm2
     movups      [ebx],xmm3
     addps       xmm3,xmm4
     movss       [ebx],xmm3
     ; row 1
     pshufd      xmm3,xmm7,01010101b  ; broadcast P1
     mulps       xmm3,xmm2
     movups      [ebx+16],xmm3
     shufps      xmm3,xmm3,11100001b  ; element 1 to the lowest dword
     addps       xmm3,xmm4
     movss       [ebx+20],xmm3
     ; row 2
     pshufd      xmm3,xmm7,10101010b  ; broadcast P2
     mulps       xmm3,xmm2
     movups      [ebx+32],xmm3
     shufps      xmm3,xmm3,11000110b  ; element 2 to the lowest dword
     addps       xmm3,xmm4
     movss       [ebx+40],xmm3
     ; row 3
     pshufd      xmm3,xmm7,11111111b  ; broadcast P3
     mulps       xmm3,xmm2
     movups      [ebx+48],xmm3
     shufps      xmm3,xmm3,00100111b  ; element 3 to the lowest dword
     addps       xmm3,xmm4
     movss       [ebx+60],xmm3        ; matrix is complete
ret
;=================================================================
xy_in_rect:
      ; Tests a point against a rectangle, both coordinates at once,
      ; as two packed signed words.
      ; in:
      ; eax - current y shl 16 + x
      ; ecx - y1 shl 16 + x1
      ; edx - y2 shl 16 + x2
      ; out:
      ; ebx = 1 when exactly one of (x1 > x), (x2 > x) holds and exactly
      ;       one of (y1 > y), (y2 > y) holds, ebx = 0 otherwise
      ; changes: xmm0, xmm1, xmm2
       movd     xmm2,edx
       movd     xmm1,ecx
       movd     xmm0,eax
       pcmpgtw  xmm2,xmm0     ; corner 2 > point (per word)
       pcmpgtw  xmm1,xmm0     ; corner 1 > point (per word)
       pxor     xmm2,xmm1
       pmovmskb ebx,xmm2
       cmp      ebx,1111b     ; all four low bytes set = x and y inside
       je       .in_rect
       xor      ebx,ebx
       ret
     .in_rect:
       mov      ebx,1
ret
;===========================================================
move_texture:
 ; Scrolls the TEX_X x TEX_Y dword bitmap at 'texmap' in place.
 ; First the vertical move is done, then the horizontal one.
 ; in:
 ;  eax - move x (ax is used): > 0 moves the content x pixels towards
 ;        column 0, < 0 moves it towards the last column
 ;  ebx - move y: > 0 moves the content y rows towards row 0,
 ;        < 0 moves it towards the last row
 ; Pixels that are not overwritten by the move keep their old content.
 ; A move by a whole texture width / height or more is ignored.
 ; changes: eax, ebx, ecx, edx, esi, edi; direction flag is clear on exit
 ;
 ; Fixes compared with the previous revision:
 ;  * backward copies (negative moves) started one dword behind the end
 ;    of the texture: they wrote 4 bytes past the buffer and left one
 ;    pixel of every row untouched;
 ;  * the direction flag stayed set after a negative move;
 ;  * moves >= texture size produced a negative 'rep' count.
    .x_tex    equ  [ebp-6]
    .y_tex    equ  [ebp-4]
    .texmap   equ  dword[ebp-12]
    .texxy    equ  dword[ebp-16]
    .texx_m   equ  dword[ebp-20]
    .texy_m   equ  dword[ebp-24]
    push      ebp
    mov       ebp,esp
    sub       esp,24
    mov       edx,TEX_X
    mov       .texx_m,edx
    mov       ecx,TEX_Y
    mov       .texy_m,ecx
    imul      ecx,edx
    mov       .texxy,ecx          ; pixels in the whole texture
    mov       .texmap,texmap
    mov       .x_tex,ax
    mov       .y_tex,bx
    or        ebx,ebx
    je        .xx
    jl        .ysub
    ; ---- y > 0: forward copy, source = row y, destination = row 0
    mov       eax,ebx
    mov       edx,.texx_m
    mul       edx                 ; eax = pixels to skip
    mov       ebx,eax
    shl       ebx,2
    mov       esi,.texmap
    mov       edi,esi
    add       esi,ebx
    cld
    sub       ecx,eax             ; pixels to copy
    jbe       .xx                 ; move >= texture height
    rep       movsd
    jmp       .xx
 .ysub:
    ; ---- y < 0: backward copy which starts at the last pixel
    mov       esi,.texxy
    shl       esi,2
    add       esi,.texmap
    sub       esi,4               ; address of the last dword
    mov       edi,esi
    neg       ebx
    mov       eax,.texx_m ;TEX_X
    mul       ebx                 ; eax = pixels to skip
    lea       ecx,[eax*4]
    sub       esi,ecx
    mov       ecx,.texxy
    sub       ecx,eax             ; pixels to copy
    jbe       .xx                 ; move >= texture height
    std
    rep       movsd
  .xx:
    xor       edx,edx
    cmp       word .x_tex,dx
    je        .end
    jl        .xadd
    ; ---- x > 0: every row is copied forward on its own
    movzx     eax,word .x_tex
    cmp       eax,.texx_m
    jae       .end                ; move >= texture width
    lea       edx,[eax*4]
    cld
    xor       ebx,ebx             ; row counter
    mov       esi,.texmap
    mov       edi,esi
    add       esi,edx
  .next_xsub_l:
    mov       ecx,.texx_m
    sub       ecx,eax
    rep       movsd
    add       esi,edx             ; both pointers to the next row
    add       edi,edx
    add       ebx,1
    cmp       ebx,.texy_m
    jne       .next_xsub_l
    jmp       .end
  .xadd:
    ; ---- x < 0: every row is copied backward, last row first
    movsx     eax, word .x_tex
    neg       eax
    cmp       eax,.texx_m
    jae       .end                ; move >= texture width
    lea       edx,[eax*4]
    xor       ebx,ebx             ; row counter
    mov       esi,.texxy
    shl       esi,2
    add       esi,.texmap
    sub       esi,4               ; address of the last dword
    mov       edi,esi
    sub       esi,edx
    std
 .nextxadd_l:
    mov       ecx,.texx_m ;TEX_X
    sub       ecx,eax
    rep       movsd
    sub       esi,edx             ; both pointers to the previous row
    sub       edi,edx
    inc       ebx
    cmp       ebx,.texy_m ;TEX_Y
    jne       .nextxadd_l
 .end:
    cld                           ; never return with DF set
    mov       esp,ebp
    pop       ebp
ret
;=====================================================
apply_displacement_from_tex:
; Moves every vertex of [points_r_ptr] along its normal from
; [points_normals_ptr]. The distance is (R+G+B) of the texel the vertex
; is mapped to (coords from [tex_points_ptr], bitmap 'texmap') multiplied
; by an approximate 1/[displac_div] (rcpss).
; A vertex is left untouched when its texel equals
; [displ_transparent_col], or when [td_wp_flag] <> 0 and the sign bit of
; the z component of its rotated normal is clear.
; NOTE(review): the older comment said that with the flag set only
; vertices with a POSITIVE normal z are computed, the test below skips
; exactly those - confirm the intended orientation.
       .vert1_ptr equ dword[ebp-4]
       .vert2_ptr equ dword[ebp-8]
       .vert3_ptr equ dword[ebp-12]
       .fact      equ [ebp-16]
        push      ebp
        mov       ebp,esp
        sub       esp,16
        cvtsi2ss  xmm5,[displac_div]
        rcpss     xmm5,xmm5
        movss     .fact,xmm5               ; scale of one colour unit
        prompt    prompt_displ
        xor       ecx,ecx
      .next_vert:
        push      ecx                      ; ecx - vertex counter
        mov       esi,ecx
        lea       ebx,[ecx+ecx*2]
        shl       ebx,2                    ; vertex index * 12
        mov       edi,[points_normals_rotated_ptr]
        add       edi,ebx
        add       ebx,[points_r_ptr]
        mov       .vert1_ptr,ebx
        movups    xmm0,[ebx]               ; xmm0 = vertex
        ; address of the texel: texmap + (tex y * TEX_X + tex x) * 4
        mov       ecx,[tex_points_ptr]
        lea       ecx,[ecx+esi*4]
        movzx     eax,word[ecx+2] ; tex y
        mov       edx,TEX_X
        mul       edx
        movzx     ecx,word[ecx]   ; tex x
        add       ecx,eax
        lea       eax,[texmap+ecx*4]
        mov       ecx,[eax]
        and       ecx,0x00ffffff
        cmp       ecx,[displ_transparent_col]  ;0x00ffffff
        je        .no             ; transparent colour - no change
        cmp       [td_wp_flag],0  ; all vertices or only the ones
                                  ; selected by the normal z sign ?
        jz        .whole
        bt        dword[edi+8],31 ; sign bit of rotated normal z
        jnc       .no
      .whole:
        movzx     ebx,byte[eax+2] ; sum of the three colour bytes
        movzx     ecx,byte[eax+1]
        add       ecx,ebx
        movzx     ebx,byte[eax]
        add       ecx,ebx
        lea       eax,[esi+esi*2]
        shl       eax,2
        add       eax,[points_normals_ptr]
        movups    xmm7,[eax]      ; xmm7 = normal
        cvtsi2ss  xmm6,ecx
        mulss     xmm6,.fact
        shufps    xmm6,xmm6,0
        mulps     xmm7,xmm6
        addps     xmm0,xmm7
        mov       ebx,.vert1_ptr
        movlps    [ebx],xmm0      ; store only 12 bytes
        movhlps   xmm0,xmm0
        movss     [ebx+8],xmm0
      .no:
        pop       ecx
        inc       ecx
        cmp       ecx,[points_count_var]
        jne       .next_vert
        cls
        leave
ret
;===========================================================
load_tetrahedron:
; Replaces the current object by the built-in one: frees the object
; buffers, allocates them again for 4000 points / 4000 triangles, copies
; the point table, expands the byte-sized triangle table to dwords and
; sets both counters to the tetra_* constants.
; NOTE(review): the triangle loop reads tetra_triangles_count*12 + 2
; BYTES and writes as many dwords - confirm against the size of the
; tetra_triangles table.

    mov    eax,'frea'
    call   free_mem_for_tp
    mov    edx,4000
    mov    [points_count_var],edx
    mov    [triangles_count_var],edx
    mov    eax,'alla'
    call   alloc_mem_for_tp

    cld
    mov    edi,[points_r_ptr]
    mov    esi,tetra_points_r
    mov    ecx,tetra_points_count*3 + 1
    rep    movsd
    mov    edi,[triangles_ptr]
    mov    esi,tetra_triangles
    mov    ecx,tetra_triangles_count*12 + 2
  .widen:
    movzx  eax,byte[esi]        ; byte from the table -> dword in the list
    inc    esi
    stosd
    dec    ecx
    jnz    .widen
    mov    [points_count_var],tetra_points_count
    mov    [triangles_count_var],tetra_triangles_count
ret
;================================================================
blur_screen:
; blur n times  ; blur or fire
; One pass: the first screen line is cleared, then xres*(yres-4)/4 blocks
; of 4 pixels are replaced by
;    avg( avg(block one line below, block one line above),
;         avg(block 16 bytes before,  block 16 bytes after) )
; minus [blur_sub] (unsigned saturated), then one more line is cleared.
; in - ecx  times count (0 = do nothing)
; uses [screen_ptr], [xres_var], [yres_var], [blur_sub]
; movaps is used on the screen, so the buffer and the line length in
; bytes have to be 16-byte aligned.
; changes: eax, ebx, ecx, edx, esi, edi, xmm0, xmm2, xmm5
;
; Fixes compared with the previous revision:
;  * 'cld' added - the string stores relied on a clear direction flag;
;  * ecx = 0 no longer wraps the pass counter around.
         .counter1  equ dword[esp-8]
         push       ebp
         mov        ebp,esp
         .xres      equ [ebp-4]
         .yres      equ [ebp-8]
         movzx      eax,[xres_var]
         movzx      ebx,[yres_var]
         push       eax ebx
         movaps     xmm5,[blur_sub]
         test       ecx,ecx
         jz         .blur_end
         cld
     .again_blur:
         push       ecx
         mov        edi,[screen_ptr]
         mov        ecx,.xres  ;SIZE_X
         xor        eax,eax
         rep        stosd              ; clear the first line
         mov        ecx,.xres  ;SIZE_X
         mov        ebx,.yres  ;SIZE_Y
         sub        ebx,4
         imul       ecx,ebx
         shr        ecx,2              ; blocks of 4 pixels to process
         mov        ebx,.xres  ;SIZE_X
         mov        esi,edi
         mov        edx,ebx
         shl        edx,2
         sub        esi,edx            ; esi = the same block one line above
     .blr:
     @@:
         movaps     xmm0,[edi+ebx*4]   ; one line below
         movaps     xmm2,[edi-16]
         pavgb      xmm0,[esi]
         pavgb      xmm2,[edi+16]
         pavgb      xmm0,xmm2
         psubusb    xmm0,xmm5  ; important if fire
         movaps     [edi],xmm0
         add        edi,16
         add        esi,16
         loop       .blr
         xor        eax,eax
         mov        ecx,.xres ;SIZE_X
         rep        stosd              ; clear the line after the block
         pop        ecx
         loop       .again_blur
     .blur_end:
         mov        esp,ebp
         pop        ebp
ret
;===========================================================
normalize_object:  ; procedure doesn't save registers !!
; Moves the centre of the bounding box of the mesh to 0,0,0 and scales
; the mesh by the approximate reciprocal (rcpps) of half of the longest
; box edge, so the coords end up in about -1.0 .. 1.0.
; in:   vertices count to normalise    =  ecx
;       pointer to vertices            =  edi   (12 bytes per vertex)
;       if eax = 'ocen' -> search only for center and
;                          do not scale object (scale = 1.0)
;       if eax = 'max'  -> search only x y z max, vertices not changed
; out:  xmm7 - broadcasted reciprocal of maxscale (1.0 if 'ocen')
;       xmm0 - center
;       xmm1 - max x, y, z (found before the vertices are moved)
;       xmm6 - broadcasted max scale
;       With eax = 'max' only xmm1 is meaningful (xmm0 is read from an
;       unwritten local, xmm7 is a copy of the caller's xmm4).
;       With ecx = 0 nothing is computed and all outputs are undefined.
;       When all vertices are equal (also: a single vertex) and scaling
;       is requested, the scale is rcpps(0) and the result is not a
;       number - use 'ocen' for such input.
;
; Fix compared with the previous revision: with exactly one vertex the
; min/max loop got the counter 0 and 'loop' wrapped around to 2^32
; iterations; the loop is now skipped in that case.
       .points_count equ ebp-40
       .points_r     equ ebp-44
       .xyz          equ [ebp-4]
       .maxxyz       equ [ebp-20]
       .center       equ [ebp-36]

       push     ebp
       mov      ebp,esp
       sub      esp,56
       mov      [.points_count],ecx
       or       ecx,ecx
       jz       .end
       dec      ecx                 ; first vertex seeds min and max
       mov      [.points_r],edi
       movups   xmm0,[edi]          ; xmm0 = running max
       movups   xmm2,[edi]          ; xmm2 = running min
       test     ecx,ecx
       jz       .bb_done            ; single vertex - nothing to compare
     .bb1:
       movups   xmm1,[edi+12]
       maxps    xmm0,xmm1
       minps    xmm2,xmm1
       add      edi,12
       loop     .bb1
     .bb_done:
       movups   .maxxyz,xmm0
       cmp      eax,'max'
       je       .only_center
       movaps   xmm3,xmm0   ; scale
       subps    xmm0,xmm2
       mulps    xmm0,[f05x3]        ; xmm0 = (max - min) * [f05x3]
       subps    xmm3,xmm0   ; xmm3 - this we sub from all
       movups   .center,xmm3
       movaps   xmm4,xmm0
       movaps   xmm5,xmm0
       shufps   xmm4,xmm4,11100001b ; y,x,z
       shufps   xmm5,xmm5,11000110b ; z,y,x
       maxps    xmm4,xmm0
       maxps    xmm4,xmm5    ; xmm4 - max scale in the lowest dword
       shufps   xmm4,xmm4,0
       movaps   xmm6,xmm4
       rcpps    xmm4,xmm4
       cmp      eax,'ocen'
       jne      .f1
       mov      edx,1.0
       movd     xmm4,edx
       shufps   xmm4,xmm4,0   ; only center: scale = 1.0
     .f1:
       mov      ecx,[.points_count]
       mov      edi,[.points_r]
     .b2:
       movups   xmm0,[edi]
       subps    xmm0,xmm3    ; set 0,0,0 in center
       mulps    xmm0,xmm4    ; scale
       movhlps  xmm1,xmm0
       movlps   [edi],xmm0   ; store only 12 bytes
       movss    [edi+8],xmm1
       add      edi,12
       loop     .b2
     .only_center:
     .end:
       movups   xmm1,.maxxyz
       movups   xmm3,.center
       ; movaps   xmm1,xmm0  ; xm1 = max x y z - if 'max' param
       movaps   xmm7,xmm4    ; xm7 = reciprocal of max scale
       movaps   xmm0,xmm3
       ; xmm6 = max scale
       mov      esp,ebp    ; if not 'max' cause
       pop      ebp
ret
;=========================================================
generate_texture2:
; Fills 'texmap' with vertical stripes: every line is built from
; .const/2 pairs of a white (0xffffffff) and a 0x00ff140a run, each run
; TEX_X/.const pixels wide. Continues in init_s_tex.
; changes: eax, bx, ecx, dx, edi (+ whatever init_s_tex changes)
        .const  equ 32
        mov     edi,texmap
        cld
        xor     bx,bx                 ; bx = line counter
     .next_line:
        xor     dx,dx                 ; dx = stripe pair counter
     .next2stripes:
        or      eax,-1
        mov     ecx,(TEX_X/.const)
        rep     stosd
        mov     eax,0x00ff140a
        mov     ecx,(TEX_X/.const)
     @@:
        stosd
        dec     ecx
        jnz     @b
        inc     dx
        cmp     dx,.const/2
        jl      .next2stripes
        inc     bx
        cmp     bx,TEX_Y
        jl      .next_line
        jmp     init_s_tex
;=========================================================
generate_texture1:  ; blank texture, used for smooth objects edges
; Sets all TEX_X*TEX_Y pixels of 'texmap' to 0xffffffff and continues
; in init_s_tex.
       cld
       mov  ecx,TEX_X*TEX_Y
       mov  eax,-1
       mov  edi,texmap
       rep  stosd
       jmp  init_s_tex
;ret
;================================================================
generate_texture3:
; XOR pattern: pixel (x, y) of 'texmap' gets the low byte of (x xor y)
; in each of its three low bytes.
; There is no 'ret' - execution falls through into init_s_tex.
       cld
       mov  edi,texmap
       xor  ecx,ecx              ; ecx = y
     .next:
       xor  ebx,ebx              ; ebx = x
     @@:
       mov  edx,ebx
       mov  eax,ecx
       xor  eax,edx
       mov  ah,al                ; ax = v,v
       shl  eax,8                ; v now sits in bits 8..23
       mov  al,ah                ; and in bits 0..7 as well
       stosd
       inc  ebx
       cmp  ebx,TEX_X
       jne  @b
       inc  ecx
       cmp  ecx,TEX_Y
       jne  .next
;ret
init_s_tex:
; Copies 32 rows of 32 pixels from 'texmap' to 'texmap_s'. After every
; row the source pointer is moved on by (TEX_X - TEX_X_S - 4) pixels.
; NOTE(review): with that step consecutive source rows start
; TEX_X - TEX_X_S + 28 pixels apart - confirm that this is intended.
; changes: ecx, esi, edi
       cld
       mov  edi,texmap_s
       mov  esi,texmap
       mov  ecx,32               ; rows
     @@:
       push ecx
       mov  ecx,32               ; pixels in one row
       rep  movsd
       pop  ecx
       add  esi,(TEX_X - TEX_X_S - 4) * 4
       dec  ecx
       jnz  @b
ret
;=============================================================
do_mandel_tex:
    ; Hands mandel_th to call_thread with the count [CoresCount]+3
    ; (presumably the number of threads - see call_thread), then copies
    ; the picture from [new_tex_ptr] (line length = xres pixels) to
    ; 'texmap' and continues in init_s_tex.
    ; in:
    ; xmm0  - scale as float
    ; Copied are min(TEX_Y, xres) lines of min(TEX_X, xres) pixels each;
    ; the lines are stored in texmap one right after another.
    ; NOTE(review): the number of lines is limited by xres, not by yres -
    ; confirm that [new_tex_ptr] always holds that many lines.
    ; 'cld' added before the copy: 'rep movsd' relied on a clear
    ; direction flag.
    push          ebp
    mov           ebp,esp
    sub           esp,58
    mov           ecx,[CoresCount]
    add           ecx,3
    mov           eax,mandel_th
    xor           edx,edx
    ; mov           ecx,'max'
    call          call_thread
    movzx         ebx,word[xres_var]       ; copy to standard tex buffer
    mov           eax,ebx
    shl           ebx,2                    ; ebx = source line in bytes
    mov           esi,[new_tex_ptr]
    mov           edi,texmap
    mov           edx,TEX_Y
    cmp           eax,edx
    cmovb         edx,eax                  ; edx = lines to copy
    cld
  @@:
    mov           ecx,TEX_X
    cmp           eax,ecx
    cmovb         ecx,eax                  ; ecx = pixels per line
    push          esi
    rep           movsd
    pop           esi
    add           esi,ebx                  ; next source line
    dec           edx
    jnz           @b
    mov           esp,ebp
    pop           ebp
    jmp           init_s_tex
;ret
;===============================
mandel_th:
; Worker which renders one horizontal band of the fractal picture into
; the buffer at [new_tex_ptr] (xres pixels per line).
; in ebx  - th no (number of the band)
;    xmm0 - scale as float (lowest dword, broadcast below)
; The band is 'unit' = yres / ([CoresCount]+3) lines high and covers
; lines th_no*unit .. th_no*unit + unit - 1.
; For each pixel 'mandel' is called with
;    xmm2 = (pixy, pixx) * (.y0sM, .x0sM) - (.cons12, .cons20)
;    xmm5 = 0, xmm4 = 4.0 (broadcast), xmm1 = [eps1], eax = 0
; and the value returned in eax, multiplied by 10001, is stored as the
; pixel.
; The locals marked with \ | / below are read / written in groups with
; 8 and 16 byte moves - don't change their order.
    push          ebp
    mov           ebp,esp
    sub           esp,70
    .cc           equ dword[ebp-4]
    .unit         equ [ebp-8]
    .thno         equ [ebp-12]
    .ymin         equ [ebp-16]
    .ymax         equ [ebp-20]
    .x0sM         equ dword[ebp-24]
    .y0sM         equ [ebp-28]
    .yres         equ dword[ebp-32]  ; \
    .xres         equ dword[ebp-36]  ; /
    .x0M          equ [ebp-40]       ; \
    .y0M          equ [ebp-44]       ; /
    .pixxM        equ dword[ebp-48]  ; \
    .pixyM        equ dword[ebp-52]  ; |  >  don t change order
    .cons20       equ dword[ebp-56]  ; |  >
    .cons12       equ [ebp-60]       ; /
    .x            equ [ebp-64]       ; \
    .y            equ [ebp-68]       ; /
    ; band limits
    movzx         eax,[yres_var]
    cdq
    mov           edi,[CoresCount]
    add           edi,3
    idiv          edi  ;[CoresCount]
    mov           .unit,eax
    mov           .thno,ebx
    imul          eax,ebx
    mov           .ymin,eax
    add           eax,.unit
    mov           .ymax,eax
    ; [xres_vard]: low word = xres, high word = yres
    mov           eax,[xres_vard]
    movzx         ebx,ax
    shr           eax,16
    mov           .yres,eax
    mov           .xres,ebx
    movaps        xmm7,xmm0
    mov           .y0sM,dword 4000.571  ; scale vars
    mov           .x0sM,9000.287
    shufps        xmm7,xmm7,0           ; xmm7 = scale in all dwords
    movlps        xmm6,.y0sM            ; xmm6 = .y0sM, .x0sM
    movaps        xmm5,[f500x3]
    mulps         xmm5,xmm7             ; (xmm5 is overwritten in the pixel loop)
    addps         xmm6,xmm7
    rcpps         xmm6,xmm6
    movlps        .y0sM,xmm6            ; pixel steps = 1 / (const + scale), approx.
    movaps        xmm1,[margin]
    mulps         xmm1,xmm7             ; (xmm1 is overwritten below)
    mov           .cons20,0.48           ; x
    mov           .cons12,dword 1.12     ; y
    movaps        xmm0,[eps2]
    movaps        xmm1,xmm0
    addps         xmm0,xmm0
    addps         xmm0,xmm0
    mulps         xmm0,xmm7             ; xmm0 = 4 * [eps2] * scale
    mov           eax,4
    movlps        xmm6,.cons12          ; xmm6 = .cons12, .cons20
    cvtsi2ss      xmm4,eax
    subps         xmm6,xmm0
    shufps        xmm4,xmm4,0           ; xmm4 = 4.0 in all dwords
    movlps        .cons12,xmm6          ; offsets corrected by the scale
    cld

;    movaps        xmm4,[const4]
    movaps        xmm1,[eps1]           ; epsilon for the period check in mandel
    mov           eax,.ymin
    mov           .pixyM,eax
  .agyM:                                ; ---- next line ----
    xor           eax,eax
    mov           .pixxM,eax
  .agxM:                                ; ---- next pixel ----
    movups        xmm2,.cons12          ; .cons12, .cons20, .pixyM, .pixxM
    movaps        xmm5,xmm2
    movhlps       xmm2,xmm2             ; low qword = .pixyM, .pixxM
    cvtdq2ps      xmm2,xmm2  ; .pixyM
    movlps        xmm3,.y0sM
    mulps         xmm2,xmm3
    subps         xmm2,xmm5             ; low qword = pix * step - offset
    xor           eax,eax    ; iter
    xorps         xmm5,xmm5             ; start value 0, 0
  .nxxM:
    call          mandel
    imul          eax,10001             ; returned value -> colour
    mov           edi,.pixyM
    mov           ebx,.xres
    imul          edi,ebx
    add           edi,.pixxM
    ; and           edi,TEXTURE_SIZE
    shl           edi,2
    add           edi,[new_tex_ptr]
    cld
    stosd
    inc           .pixxM
    mov           eax,.ymax
    cmp           .pixxM,ebx
    jl            .agxM
    inc           .pixyM
    cmp           .pixyM,eax
    jl            .agyM
    mov           esp,ebp
    pop           ebp
ret
;===============================
mandel:
; Iterates one point and returns how many iterations it took to escape.
; Writing a = lowest dword and b = second dword of xm5, one step is
;    a' = 2*a*b     + lowest dword of xm2
;    b' = a*a - b*b + second dword of xm2
; in:
;  xm5 = start value (mandel_th passes 0, 0)
;  xm2 = constant added in every step (named .y0M, .x0M by the author)
;  xm4 = escape limit compared with a*a + b*b (mandel_th passes 4.0)
;  xm1 = epsilon of the period check
;  eax = iteration counter start value (mandel_th passes 0)
; out:
;  eax - col: iteration counter at the moment a*a + b*b got above the
;        limit; 0 when 1000 was reached or when the value came back to
;        within +/- epsilon of the stored period value
; changes: ecx, esi, xmm0, xmm3, xmm5, xmm6, xmm7
    xor           ecx,ecx   ; ecx = steps since the period value was taken

    movaps        xmm0,xmm5 ; xm0 = period value
  .nxxM:                    ; xm5 = curr value
    movaps        xmm7,xmm5 ; xm7 = xm5 = a, b
    mulps         xmm7,xmm7
    movaps        xmm6,xmm7
    haddps        xmm7,xmm7 ; xm7 = a*a + b*b
    comiss        xmm7,xmm4 ; xm4 = escape limit
    ja            .set_pixM
    hsubps        xmm6,xmm6 ; xm6 = a*a - b*b
    movaps        xmm7,xmm5
    shufps        xmm7,xmm7,01010101b
    ; xmm7 = b, b, ..
    ; xmm5 = a, b, ..
    mulss         xmm7,xmm5 ; xm7 = a*b in the lowest dword
    addps         xmm7,xmm7
    punpckldq     xmm7,xmm6 ; xm7 = a*b*2, a*a - b*b
    addps         xmm7,xmm2 ; add the constant
    movaps        xmm5,xmm7 ; xm5 = xm7 = new value
    movaps        xmm6,xmm0
    movaps        xmm3,xmm0
    addps         xmm3,xmm1 ; xm1 = epsilon
    subps         xmm6,xmm1
    cmpltps       xmm6,xmm7 ; check if both components are back within
    cmpltps       xmm3,xmm7 ; +/- epsilon of the period value
    xorps         xmm3,xmm6
    movmskps      esi,xmm3
    and           esi,11b
    cmp           esi,11b
    je            .set_black
    inc           ecx
    cmp           ecx,20    ; period = 20
    jne           @f
    xor           ecx,ecx
    movaps        xmm0,xmm7 ; take a new period value
  @@:
    inc           eax
    cmp           eax,1000  ; max iterations = 1000
    jl            .nxxM
  .set_black:
    xor           eax,eax
  .set_pixM:
ret
;===========================================================
calc_bumpmap_coords:      ; map texture, bump
; Computes texture (bump map) coordinates for every vertex, using planar or
; spherical mapping depending on [map_flag].
; in  = eax - scale as dword (bit pattern of a float; it is used as a
;             multiplier in mulps / mulss)
;       [map_flag]         - 0, 1, 2: planar mapping (different axis pairs),
;                            greater than 2: spherical mapping
;       [points_r_ptr]     - source vertices, 12 bytes (3 floats) each
;       [points_count_var] - number of vertices; nothing is done when it is 0
;       [tex_x_div2f]      - 16 bytes copied into the local frame
;                            (.tex_x_div2f, .tex_y_div2f, .tex_x_div2, .tex_y_div2)
; out = [tex_points_f_ptr] - 8 bytes per vertex: the coordinates as 2 floats
;       [tex_points_ptr]   - 4 bytes per vertex: the coordinates as 2 words
; changes: eax, ebx, ecx, edx, esi, edi and xmm registers
      push         ebp
      mov          ebp,esp
      sub          esp,96          ; multiple of 4, so esp stays dword aligned
                                   ; for pushad / call; covers the bytes lost
                                   ; by the 'and' below plus locals to ebp-70
      and          ebp,-16         ; locals are addressed from the aligned ebp,
                                   ; the epilogue therefore restores esp by 'add'
      .sc          equ       [ebp-4]
      .off1        equ dword [ebp-8]
      .off2        equ dword [ebp-12]
      .pi          equ       [ebp-16]
      .maxz        equ       [ebp-24]
      .maxy        equ       [ebp-28]
      .maxyz       equ       [ebp-32]
      .tex_y_div2  equ       [ebp-38]
      .tex_x_div2  equ       [ebp-42]
      .tex_y_div2f equ       [ebp-46]
      .tex_x_div2f equ       [ebp-50]
      .atan        equ dword [ebp-54]
      .atan1       equ dword [ebp-58]
      .atan2       equ dword [ebp-62]
      .pcv         equ       [ebp-66]
      .pts         equ       [ebp-70]
      mov        .sc,eax
      movups     xmm0,[tex_x_div2f]
      movups     .tex_x_div2f,xmm0
      mov        esi,[points_r_ptr]
      mov        edi,[tex_points_ptr]
      mov        edx,[tex_points_f_ptr]
      mov        ecx,[points_count_var]
      test       ecx,ecx
      jz         .end              ; no vertices - the loops below would wrap around
      mov        .pcv,ecx
      mov        .pts,esi
      cmp        [map_flag],2
      jg         .spherical
       ; choose map planar or spherical
       ; map flag 0,1,2 - various planar mapping
       ; eax / ebx = byte offsets inside a vertex of the two source loads
      cmp        [map_flag],0
      jne        @f
      xor        eax,eax
      mov        ebx,4
      jmp        .do_planar
  @@:
      cmp        [map_flag],1
      jne        @f
      mov        eax,4
      mov        ebx,8
      jmp        .do_planar
  @@:
      mov        eax,8
      mov        ebx,4
   .do_planar:
      mov       .off1,eax
      mov       .off2,ebx
      movups    xmm1,.tex_x_div2f
      movups    xmm2,.sc
      mulps     xmm1,xmm2
      movups    xmm3,.tex_x_div2
      shufps    xmm1,xmm1,0        ; xmm1 = tex_x_div2f * scale in every lane
      shufps    xmm3,xmm3,0
      cvtdq2ps  xmm3,xmm3          ; xmm3 = tex_x_div2 (int -> float) in every lane
   .do:
      mov       eax,.off1
      mov       ebx,.off2
      movlps    xmm0,[esi+eax]     ; lane 0 = first source coordinate
      movhps    xmm0,[esi+ebx]     ; lane 2 = second source coordinate
      mulps     xmm0,xmm1
      addps     xmm0,xmm3
      shufps    xmm0,xmm0,11111000b ; bring lanes 0 and 2 to lanes 0 and 1
      movlps    [edx],xmm0         ; float coordinates
      cvtps2dq  xmm0,xmm0
      packssdw  xmm0,xmm0
      movss     [edi],xmm0         ; the same coordinates as two words
      add       esi,12
      add       edi,4
      add       edx,8
      dec       ecx
      jnz       .do
      jmp       .end
;;  Spherical wrapping around an axis:
;;  X) tu = Su/(2*pi) * arctg(z/y) - ou   tv = (Sv/pi) * arctg(x/sqrt(x^2+y^2+z^2)) - ov
;;  Y) tu = Su/(2*pi) * arctg(x/z) - ou   tv = (Sv/pi) * arctg(y/sqrt(x^2+y^2+z^2)) - ov
;;  Z) tu = Su/(2*pi) * arctg(x/y) - ou   tv = (Sv/pi) * arctg(z/sqrt(x^2+y^2+z^2)) - ov
;;  where Su, Sv are the parameters scaling the respective texture coordinates;
;;  ou, ov are the coordinates of the texture centre.
   .spherical:
      ; spherical mapping around y axle
      mov       .pi, dword 3.141592653
      ; find max x, y, z from points_r to scale properly
      pushad
      mov       eax,'max'        ; find max x, y, z
      mov       edi,  dword .pts
      mov       ecx,  dword .pcv
      call      normalize_object
      popad
      movups    .maxyz,xmm1      ; result of normalize_object, read back below
                                 ; through .maxyz and .maxz
   .sph_next:
      ; Load x, y and z. The length below sums three lanes, so the z lane
      ; must come from the vertex; only xmm0 is used to build the vector.
      movss     xmm0,[esi+8]     ; xmm0 = z, 0, 0, 0
      movlhps   xmm0,xmm0        ; xmm0 = z, 0, z, 0
      movlps    xmm0,[esi]       ; xmm0 = x, y, z, 0
      movaps    xmm1,xmm0
      movaps    xmm2,xmm0
      divss     xmm2,[esi+4]     ; xmm2 lane 0 = x / y
      dpps      xmm0,xmm0,01110111b ; x*x + y*y + z*z in lanes 0..2
      rsqrtps   xmm0,xmm0
      mulps     xmm0,xmm1        ; vertex scaled by 1 / length
      punpckldq xmm0,xmm2        ; xmm0 = x/len, x/y, y/len, y
      call      sin_cos          ; NOTE(review): sin_cos is defined elsewhere;
                                 ; its input/output lanes are not visible here
      movlps    xmm1,.pi
      rcpps     xmm7,xmm1        ; xmm7 lane 0 = 1 / pi
      ; NOTE(review): the 1/(2*pi) value computed by the next two instructions
      ; is overwritten by 'movaps xmm1,xmm6' before it is used.
      addps     xmm1,xmm1
      rcpps     xmm1,xmm1
      movlps    xmm6,.sc
      movaps    xmm1,xmm6
      divss     xmm1,.maxz

      movhlps   xmm4,xmm0
      mulps     xmm4,xmm1
      mulss     xmm6,xmm7
      mulss     xmm6,.maxyz
      shufps    xmm0,xmm0,11111111b
      mulps     xmm0,xmm6
      punpckldq xmm4,xmm0

      movlps    [edx],xmm4       ; float coordinates
      cvtps2dq  xmm4,xmm4        ; float -> int, same conversion as the planar path
      packssdw  xmm4,xmm4
      movss     [edi],xmm4       ; the same coordinates as two words
      add       esi,12
      add       edi,4
      add       edx,8
      dec       ecx
      jnz       .sph_next
   .end:
      add       esp,96
      pop       ebp
ret
;================================================================
calc_bumpmap:            ; fill the bump map, then smooth it
; in:  [bump_flag] - 0: random heights, otherwise heights are derived
;                    from the first three bytes of every texmap texel
; out: bump_map    - TEXTURE_SIZE bytes written and blurred 4 times in place
; The destination is always bump_map; no pointer is taken from the caller.
; All general purpose registers are preserved (pushad / popad).
         pushad
         mov     ecx,TEXTURE_SIZE        ; texel counter of the fill loops
         mov     esi,texmap
         mov     edi,bump_map
         cmp     [bump_flag],0
         jne     .texel_from_tex
      .texel_random:
         push    ecx                     ; ecx is the 'min' argument of random
         mov     edx,255
         xor     ecx,ecx
         call    random                  ; eax = value in 0..254, ebx/edx changed
         pop     ecx
         stosb
         dec     ecx
         jnz     .texel_random
         jmp     .smooth
      .texel_from_tex:
         movzx   eax,byte[esi]
         movzx   ebx,byte[esi+1]
         add     eax,ebx
         movzx   ebx,byte[esi+2]
         add     eax,ebx                 ; eax = sum of the three bytes
         imul    [irecipr3]              ; edx = high part of sum * [irecipr3]
         mov     eax,edx
         stosb
         add     esi,4                   ; texmap texels are 4 bytes apart
         dec     ecx
         jnz     .texel_from_tex
      .smooth:
         mov     edi,bump_map
         mov     ecx,4                   ; number of blur passes
      .blur_pass:
         xor     esi,esi                 ; esi = index of the current texel
         mov     edx,TEXTURE_SIZE        ; edx = texels left in this pass
      .blur_texel:
         ; average of the left, right, upper and lower neighbour;
         ; indices wrap around by masking with TEXTURE_SIZE
         lea     ebp,[esi-1]
         and     ebp,TEXTURE_SIZE
         movzx   eax,byte[ebp+edi]
         lea     ebp,[esi+1]
         and     ebp,TEXTURE_SIZE
         movzx   ebx,byte[ebp+edi]
         add     eax,ebx
         lea     ebp,[esi-TEX_X]
         and     ebp,TEXTURE_SIZE
         movzx   ebx,byte[ebp+edi]
         add     eax,ebx
         lea     ebp,[esi+TEX_X]
         and     ebp,TEXTURE_SIZE
         movzx   ebx,byte[ebp+edi]
         add     eax,ebx
         shr     eax,2
         mov     byte[esi+edi],al
         inc     esi
         dec     edx
         jnz     .blur_texel
         dec     ecx
         jnz     .blur_pass
         popad
ret
;================================================================
make_random_lights:
; Fills every light record from lights1 up to lightsend (step LIGHT_SIZE)
; with a random vector and random colours, then builds one 64 byte entry
; per light in lights_aligned.
; Light record fields as written here:
;   +0  x, y, z       - 3 floats
;   +12 orginal cols  - 3 words
;   +20 min cols      - 3 words
;   +28 max cols and shine - 4 words
; lights_aligned entry: +0 normalized vector, +16 orginal cols / 2,
;   +32 min cols / 2, +48 max cols, all converted to floats.
; changes: eax, ebx, ecx, edx, esi, edi, xmm0 - xmm7
; (ebx and edx are changed by the calls to random)
  .temp1  equ [ebp-4]
  .temp2  equ [ebp-8]     ;  - light vector generate variables
  .temp3  equ [ebp-12]
  .col1   equ ebp-20      ; .col1 - .col3 are not referenced in this procedure
  .col2   equ ebp-28
  .col3   equ ebp-32
  .max    equ 800
  .max1   equ 1000
        push      ebp
        mov       ebp,esp
        sub       esp,32
        mov       edi,lights1
        mov       .temp1, dword 1000.0   ; stored but never read below
        mov       dword .temp2,800.0
        mov       dword .temp3,400.0
        rcpss     xmm3,.temp3            ; xmm3 ~ 1/400
        rcpss     xmm2,.temp2            ; xmm2 ~ 1/800
    .again:
        xor       esi,esi
     @@:
        ; x, y, z = (random(0, 800) - 400) / 400; z is replaced further down
        mov       edx,.max
        xor       ecx,ecx
        call      random
        sub       eax,.max/2
        cvtsi2ss  xmm1,eax
        mulss     xmm1,xmm3
        movss     [edi+esi*4],xmm1
        inc       esi
        cmp       esi,3
        jne       @b
        ; z = random(0, 1000) / 800 with the sign changed by [sign_mask]
        ; (presumably the float sign bit - confirm against its definition)
        mov       edx,.max1
        xor       ecx,ecx
        call      random
        cvtsi2ss  xmm4,eax
        xorps     xmm4,[sign_mask]
        mulss     xmm4,xmm2
        movss     [edi+8],xmm4
        xor       esi,esi
     @@:
        mov       ecx,230            ; max colors and shine  ,
                                     ; ecx = 200 - more bright shading
        mov       edx,255
        call      random
        movzx     ax,al
        mov       [edi+28+esi*2],ax
        inc       esi
        cmp       esi,4
        jne       @b
        xor       esi,esi
     @@:
        mov       ecx,70              ; orginal colors
        movzx     edx,word[edi+28+esi*2] ; upper limit = max color of this channel
        call      random
        movzx     ax,al
        mov       [edi+12+esi*2],ax
        inc       esi
        cmp       esi,3
        jne       @b
        xor       esi,esi
     @@:
        mov       ecx,1               ; min cols
        movzx     edx,word[edi+12+esi*2] ; upper limit = orginal color of this channel
        call      random
        movzx     ax,al
        mov       [edi+20+esi*2],ax
        inc       esi
        cmp       esi,3
        jne       @b
        add       edi,LIGHT_SIZE ;22
        cmp       edi,lightsend
        jne       .again

        mov       edi,lights_aligned
        xorps     xmm5,xmm5           ; zero, used to widen words to dwords
        movaps    xmm6,[zero_hgst]    ; mask applied to every stored vector
                                      ; (presumably clears the highest dword - confirm)
        mov       esi,0x80000000
        xorps     xmm7,xmm7
        movd      xmm7,esi
        shufps    xmm7,xmm7,11001111b ; xmm7 = sign bit in the z lane only
        mov       esi,lights1
    ;    movaps    xmm7,[z_minus_mask]
     .do_aligned: ; make aligned float lights used
                  ; in all Real Phong rendering models
        movups    xmm0,[esi]
        orps      xmm0,xmm7   ;[z_minus_mask]
        movaps    xmm1,xmm0   ; normalize
    ;    mulps     xmm1,xmm1
    ;    andps     xmm1,xmm6   ;[zero_hgst_dd]
    ;    haddps    xmm1,xmm1
    ;    haddps    xmm1,xmm1
        dpps      xmm1,xmm1,01110111b ; x*x + y*y + z*z
        rsqrtps   xmm1,xmm1
        mulps     xmm0,xmm1
        movlps    xmm1,[esi+12]       ; orginal cols
        movlps    xmm2,[esi+12+8]     ; min cols
        movlps    xmm3,[esi+12+16]    ; max cols and shine
        punpcklwd xmm1,xmm5
        punpcklwd xmm2,xmm5
        punpcklwd xmm3,xmm5
        psrld     xmm1,1              ; orginal and min cols are halved,
        psrld     xmm2,1              ; max cols are kept as they are
        andps     xmm0,xmm6
        cvtdq2ps  xmm1,xmm1
        cvtdq2ps  xmm2,xmm2
        cvtdq2ps  xmm3,xmm3
        andps     xmm1,xmm6
        andps     xmm2,xmm6
        andps     xmm3,xmm6
        movaps    [edi],xmm0
        movaps    [edi+16],xmm1
        movaps    [edi+32],xmm2
        movaps    [edi+48],xmm3
        add       edi,64
        add       esi,LIGHT_SIZE
        cmp       esi,lightsend
        jne       .do_aligned
        mov       esp,ebp
        pop       ebp
ret
;=======================================================
init_point_lights:
; Builds point_light_coords for the first 3 entries of lights_aligned:
; every 16 byte result is the light vector multiplied by [f05x3] and then
; by -1000.0 (the highest lane of the second factor is whatever xmm1 held
; on entry).
; changes: eax, ecx, esi, edi, xmm0, xmm1
        mov       edi,point_light_coords   ; destination, 16 bytes per light
        mov       esi,lights_aligned       ; source, 64 bytes per light
        mov       ecx,3                    ; number of lights handled
        mov       eax,-1000
        cvtsi2ss  xmm1,eax
        shufps    xmm1,xmm1,11000000b      ; lanes 0..2 = -1000.0
      .next_light:
        movaps    xmm0,[esi]
        lea       esi,[esi+64]
        mulps     xmm0,[f05x3]
        mulps     xmm0,xmm1
        movaps    [edi],xmm0
        add       edi,16
        loop      .next_light
ret
;=============================================================
random:
;  in  - ecx - min
;        edx - max
;  out - eax - random number = min + high dword of (max - min) * seed
;  changes: ebx (holds the new seed), edx, [rand_seed]
     ;    cmp    [isCUDA],0
     ;    jne    .ran_cu
         imul   ebx,[rand_seed],1000001   ; advance the generator
         ror    ebx,16
         mov    [rand_seed],ebx
         mov    eax,edx
         sub    eax,ecx                   ; eax = max - min
         mul    ebx                       ; edx = high dword of the product
         mov    eax,ecx
         add    eax,edx                   ; shift the result up by min
ret
;.ran_cu:
;include 'cudacode.inc'
;=================================================================
do_color_buffer:         ; do color buffer for Gouraud, flat shading
; Fills color_map with TEX_X x TEX_Y dwords (one stosd per pixel), many
; lights using: for every pixel a vector is built from the pixel position,
; every light contributes a colour through dot_product + calc_one_col and
; the per byte maximum over all lights is stored with the top byte = 0xff.
; in:  none in registers; reads [tex_x_div2], lights1, [the_one], [sign_mask]
; out: color_map
; changes: eax, ebx, ecx, edx, esi, edi, xmm registers
; NOTE(review): relies on dot_product leaving ebx, ecx, edx and esi intact
; and on the direction flag being clear for stosd - confirm both.
.temp   equ dword  [ebp-22]
.nz     equ dword  [ebp-6]  ; dword
.ny     equ dword  [ebp-10]
.nx     equ        [ebp-14] ; .nx, .ny, .nz are consecutive: the vector given to dot_product
.col_r  equ        [ebp-16]
.col_g  equ        [ebp-17]
.col_b  equ        [ebp-18] ; colour accumulator, accessed as one dword
.xx     equ        [ebp-26]
.yy     equ        [ebp-30]
.zz     equ        [ebp-34]
;.colf  equ        [ebp-50]
         push      ebp
         mov       ebp,esp
         sub       esp,40
         mov       edi,color_map
         ; .yy / .xx = reciprocals of the two dwords at [tex_x_div2], swapped
         ; so that they line up with the edx, ecx pair converted below
         cvtpi2ps  xmm3,[tex_x_div2]
         rcpps     xmm3,xmm3
         shufps    xmm3,xmm3,11100001b
         movlps    .yy,xmm3
         mov       edx,- TEX_Y / 2 ;-256   ; dx - vertical coordinate = y
    .ie_ver:
         mov       ecx,- TEX_X / 2 ;256   ; cx - horizontal coord = x
    .ie_hor:

         push      ecx edx          ; [esp] = edx, [esp+4] = ecx
         cvtpi2ps  xmm2,[esp]       ; both counters as floats
         movlps    xmm1,.yy
         mulps     xmm2,xmm1
         movlps    .nx,xmm2         ; writes .nx and .ny
         mulps     xmm2,xmm2
         pcmpeqd   xmm7,xmm7
         haddps    xmm2,xmm2        ; sum of the two squares
         add       esp,8            ; drop the two pushed counters
         psrld     xmm7,1           ; xmm7 = 0x7fffffff in every lane

         ; .nz = - sqrt( | sum of squares - [the_one] | )
         subps     xmm2,[the_one]
         andps     xmm2,xmm7 ;[abs_mask]
         sqrtps    xmm2,xmm2
         xorps     xmm2,[sign_mask]
         movss     .nz,xmm2
         xor       ebx,ebx          ; ebx = byte offset of the current light
         mov       dword .col_b, ebx ; start with a black pixel
    .light:
        push       edi   ;env_map
        lea        esi,[lights1+ebx]
        lea        edi,.nx
        call       dot_product
        ; out xmm0 - dot
        pop        edi
        xorps      xmm2,xmm2
        minps      xmm0,[the_one]   ; clamp the dot product to 0 .. [the_one]
        maxps      xmm0,xmm2
;    .env_ok2:
;       [esi+36]   ; -  shines
        movlps     xmm4,[esi+20]  ;.min_col_r
        movlps     xmm1,[esi+28]  ;.max_col_r
        movlps     xmm2,[esi+12]  ;.org_col_r
        call       calc_one_col
        ; eax-0x00rrggbb
        movd       xmm0,eax
        movlps     xmm1,.col_b
        pmaxub     xmm1,xmm0      ; keep the brightest value of every byte
        movss      .col_b,xmm1
    .update_counters:                             ; update and jump when neccesery
        add        ebx,LIGHT_SIZE
        cmp        bx,all_lights_size1
        jl         .light    ; next_light
        mov        eax,.col_b
        or         eax,0xff000000
        stosd
        inc        ecx
        cmp        ecx,TEX_X / 2
        jne        .ie_hor
        inc        edx
        cmp        edx,TEX_Y / 2
        jne        .ie_ver
    .env_done:
        mov        esp,ebp
        pop        ebp
ret
;============================================================
init_envmap2:   ; do env_map
; Fills envmap with TEX_X x TEX_Y dwords (one stosd per pixel). For every
; pixel and every light the distance between the pixel and the light
; position (light x, y scaled by the two values at [tex_x_div2]) is turned
; into a factor clamped to 0 .. [the_one]; calc_one_col converts it into a
; colour and the per byte maximum over all lights is stored with the top
; byte = 0xff.
; in:  none in registers; reads [tex_x_div2], lights1, [f255d], [env_const],
;      [the_one]
; out: envmap
; changes: eax, ebx, ecx, edx, esi, edi, xmm registers
; NOTE(review): stosd assumes the direction flag is clear - confirm.
        .col_r equ  [ebp-8]
        .col_g equ  [ebp-9]
        .col_b equ  [ebp-10]   ; colour accumulator, accessed as one dword
        .yy    equ  [ebp-18]
        .xx    equ  [ebp-22]   ; .xx, .yy = the values at [tex_x_div2] as floats
        .cxxx  equ  [ebp-26]   ; receives edx (vertical counter)
        .dxxx  equ  [ebp-30]   ; receives ecx (horizontal counter)
         push     ebp
         mov      ebp,esp
         sub      esp,36           ; multiple of 4: keeps esp dword aligned for
                                   ; the pushes and calls below (locals end at
                                   ; ebp-30)
         mov      edi,envmap
         cvtpi2ps xmm3,[tex_x_div2]
         movlps   .xx,xmm3
         mov      edx,- TEX_Y / 2 ;256   ; dx - vertical coordinate = y
    .ie_ver:
         push     edx
         mov      ecx,- TEX_X / 2 ;256   ; cx - horizontal coord = x
    .ie_hor:
         push     ecx
         xor      ebx,ebx          ; ebx = byte offset of the current light
         mov      dword .col_b,ebx ; start with a black pixel
     .light:
         push     ebx
         lea      esi,[lights1+ebx]
         mov      .cxxx,edx
         mov      .dxxx,ecx
         movlps   xmm0,[esi]       ; light x, y
         movlps   xmm2,.xx
         cvtpi2ps xmm3,.dxxx       ; pixel x, y as floats
         mulps    xmm0,xmm2        ; light position in pixel units
         subps    xmm3,xmm0
         mulps    xmm3,xmm3
         haddps   xmm3,xmm3
         movlps   xmm0,[f255d]
         sqrtps   xmm3,xmm3        ; distance pixel - light
         rcpps    xmm5,xmm0
         subps    xmm0,xmm3        ; [f255d] - distance
         mulss    xmm0,[env_const]
         mulps    xmm0,xmm5        ; scaled by 1 / [f255d]
         xorps    xmm1,xmm1
         maxps    xmm0,xmm1        ; clamp to 0 .. [the_one]
         minps    xmm0,[the_one]
;        push     word[esi+36]   ; -  shines
         movlps   xmm4,[esi+20]  ;.min_col_r
         movlps   xmm1,[esi+28]  ;.max_col_r
         movlps   xmm2,[esi+12]  ;.org_col_r
         call     calc_one_col
         movd     xmm0,eax
         movlps   xmm1,.col_b
         pmaxub   xmm1,xmm0        ; keep the brightest value of every byte
         movss    .col_b,xmm1
   .update_counters:                     ; update and jump when neccesery
         pop      ebx
         add      ebx,LIGHT_SIZE
         cmp      bx,all_lights_size1
         jl       .light    ; next_light
         mov      eax,.col_b
         or       eax,0xff000000
         stosd
         pop      ecx
         inc      ecx
         cmp      ecx,TEX_X / 2 ;256
         jne      .ie_hor
         pop      edx
         inc      edx
         cmp      edx,TEX_Y / 2 ;256
         jne     .ie_ver
         mov      esp,ebp
         pop      ebp
ret
;=========================================================
 calc_one_col:
; Computes one packed colour from a lighting factor and three colour sets.
; The procedure does not save registers: it changes eax and xmm0 - xmm7.
; in -   xmm0   - dot_product (lowest float)
;        xmm4   - .min_col bgr - words  ; minimum color - ambient
;        xmm1   - .max_col bgr - words  ; maximum color - specular
;        xmm2   - .org_col bgr - words  ; orginal color - diffuse
;                                       ; shines - not implemented
; out -  eax - packed bytes, one per input word; the callers in this file
;              replace the top byte afterwards
; per channel: (min + dot*org + dot^16*max) * max / (min + org + max)
         shufps    xmm0,xmm0,0          ; dot in every lane
         xorps     xmm3,xmm3            ; zero, used to widen words to dwords
         movaps    xmm7,xmm0            ; xmm7 = dot
         movaps    xmm5,xmm4            ; xmm5 = min
         paddw     xmm4,xmm2
         paddw     xmm4,xmm1            ; xmm4 = min + org + max
         ; words -> dwords -> floats
         punpcklwd xmm1,xmm3            ; max
         punpcklwd xmm2,xmm3            ; org
         punpcklwd xmm4,xmm3            ; sum
         punpcklwd xmm5,xmm3            ; min
         cvtdq2ps  xmm1,xmm1
         cvtdq2ps  xmm2,xmm2
         cvtdq2ps  xmm4,xmm4
         cvtdq2ps  xmm5,xmm5
         movaps    xmm6,xmm1            ; keep max for the final scaling
         rcpps     xmm4,xmm4            ; 1 / sum
         ; dot^16 by squaring four times
         mulps     xmm0,xmm0
         mulps     xmm0,xmm0
         mulps     xmm0,xmm0
         mulps     xmm0,xmm0
         mulps     xmm2,xmm7            ; org * dot
         mulps     xmm1,xmm0            ; max * dot^16
         addps     xmm1,xmm2
         addps     xmm1,xmm5            ; + min
         mulps     xmm1,xmm6            ; * max
         mulps     xmm1,xmm4            ; / sum
         ; floats -> dwords -> words -> bytes, saturating
         cvtps2dq  xmm1,xmm1
         packssdw  xmm1,xmm1
         packuswb  xmm1,xmm1
         movd      eax,xmm1
ret
